Skip to content

Commit 5ca0d8a

Browse files
fix: Fix list aggregates on empty series (#4155)
## Changes Made

When performing a list agg on an empty partition, we throw `daft.exceptions.DaftCoreException: DaftError::ValueError Need at least 1 series to perform concat`. To fix this, this PR adds a check for whether the series is empty. In that case we simply return an empty series.

## Related Issues

Addresses #4153
1 parent f0b4469 commit 5ca0d8a

File tree

2 files changed

+90
-7
lines changed

2 files changed

+90
-7
lines changed

src/daft-core/src/array/ops/list.rs

+16-7
Original file line number | Diff line number | Diff line change
@@ -15,7 +15,10 @@ use crate::{
1515
FixedSizeListArray, ListArray, StructArray,
1616
},
1717
count_mode::CountMode,
18-
datatypes::{BooleanArray, DataType, Field, Int64Array, UInt64Array, Utf8Array},
18+
datatypes::{
19+
try_mean_aggregation_supertype, BooleanArray, DataType, Field, Int64Array, UInt64Array,
20+
Utf8Array,
21+
},
1922
kernels::search_sorted::build_is_valid,
2023
prelude::MapArray,
2124
series::{IntoSeries, Series},
@@ -1035,9 +1038,10 @@ impl FixedSizeListArray {
10351038
macro_rules! impl_aggs_list_array {
10361039
($la:ident) => {
10371040
impl $la {
1038-
fn agg_helper<T>(&self, op: T) -> DaftResult<Series>
1041+
fn agg_helper<T, F>(&self, op: T, target_type_getter: F) -> DaftResult<Series>
10391042
where
10401043
T: Fn(&Series) -> DaftResult<Series>,
1044+
F: Fn(&DataType) -> DaftResult<DataType>,
10411045
{
10421046
// TODO(Kevin): Currently this requires full materialization of one Series for every list. We could avoid this by implementing either sorted aggregation or an array builder
10431047

@@ -1050,23 +1054,28 @@ macro_rules! impl_aggs_list_array {
10501054

10511055
let agg_refs: Vec<_> = aggs.iter().collect();
10521056

1053-
Series::concat(agg_refs.as_slice()).map(|s| s.rename(self.name()))
1057+
if agg_refs.is_empty() {
1058+
let target_type = target_type_getter(self.child_data_type())?;
1059+
Ok(Series::empty(self.name(), &target_type))
1060+
} else {
1061+
Series::concat(agg_refs.as_slice()).map(|s| s.rename(self.name()))
1062+
}
10541063
}
10551064

10561065
pub fn sum(&self) -> DaftResult<Series> {
1057-
self.agg_helper(|s| s.sum(None))
1066+
self.agg_helper(|s| s.sum(None), |dtype| Ok(dtype.clone()))
10581067
}
10591068

10601069
pub fn mean(&self) -> DaftResult<Series> {
1061-
self.agg_helper(|s| s.mean(None))
1070+
self.agg_helper(|s| s.mean(None), try_mean_aggregation_supertype)
10621071
}
10631072

10641073
pub fn min(&self) -> DaftResult<Series> {
1065-
self.agg_helper(|s| s.min(None))
1074+
self.agg_helper(|s| s.min(None), |dtype| Ok(dtype.clone()))
10661075
}
10671076

10681077
pub fn max(&self) -> DaftResult<Series> {
1069-
self.agg_helper(|s| s.max(None))
1078+
self.agg_helper(|s| s.max(None), |dtype| Ok(dtype.clone()))
10701079
}
10711080
}
10721081
};

tests/recordbatch/list/test_list_numeric_aggs.py

+74
Original file line number | Diff line number | Diff line change
@@ -1,7 +1,9 @@
11
from __future__ import annotations
22

3+
import pyarrow as pa
34
import pytest
45

6+
import daft
57
from daft.datatype import DataType
68
from daft.expressions import col
79
from daft.recordbatch import MicroPartition
@@ -33,3 +35,75 @@ def test_list_min(table):
3335
def test_list_max(table):
3436
result = table.eval_expression_list([col("a").list.max()])
3537
assert result.to_pydict() == {"a": [2, 4, 5, None, None]}
38+
39+
40+
def test_list_numeric_aggs_empty_table():
41+
empty_table = MicroPartition.from_pydict(
42+
{
43+
"col": pa.array([], type=pa.list_(pa.int64())),
44+
"fixed_col": pa.array([], type=pa.list_(pa.int64(), 2)),
45+
}
46+
)
47+
48+
result = empty_table.eval_expression_list(
49+
[
50+
col("col").cast(DataType.list(DataType.int64())).list.sum().alias("col_sum"),
51+
col("col").list.mean().alias("col_mean"),
52+
col("col").list.min().alias("col_min"),
53+
col("col").list.max().alias("col_max"),
54+
col("fixed_col").list.sum().alias("fixed_col_sum"),
55+
col("fixed_col").list.mean().alias("fixed_col_mean"),
56+
col("fixed_col").list.min().alias("fixed_col_min"),
57+
col("fixed_col").list.max().alias("fixed_col_max"),
58+
]
59+
)
60+
assert result.to_pydict() == {
61+
"col_sum": [],
62+
"col_mean": [],
63+
"col_min": [],
64+
"col_max": [],
65+
"fixed_col_sum": [],
66+
"fixed_col_mean": [],
67+
"fixed_col_min": [],
68+
"fixed_col_max": [],
69+
}
70+
71+
72+
def test_list_numeric_aggs_with_groupby():
73+
df = daft.from_pydict(
74+
{
75+
"group_col": [1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3],
76+
"id_col": [3, 1, 2, 2, 5, 4, None, 3, None, None, None, None],
77+
}
78+
)
79+
80+
# Group by and test aggregates.
81+
grouped_df = df.groupby("group_col").agg(daft.col("id_col").agg_list().alias("ids_col"))
82+
result = grouped_df.select(
83+
col("group_col"),
84+
col("ids_col").list.sum().alias("ids_col_sum"),
85+
col("ids_col").list.mean().alias("ids_col_mean"),
86+
col("ids_col").list.min().alias("ids_col_min"),
87+
col("ids_col").list.max().alias("ids_col_max"),
88+
).sort("group_col", desc=False)
89+
result_dict = result.to_pydict()
90+
expected = {
91+
"group_col": [1, 2, 3],
92+
"ids_col_sum": [8, 12, None],
93+
"ids_col_mean": [2.0, 4.0, None],
94+
"ids_col_min": [1, 3, None],
95+
"ids_col_max": [3, 5, None],
96+
}
97+
assert result_dict == expected
98+
99+
# Cast to fixed size list, group by, and test aggregates.
100+
grouped_df = grouped_df.with_column("ids_col", col("ids_col").cast(DataType.fixed_size_list(DataType.int64(), 4)))
101+
result = grouped_df.select(
102+
col("group_col"),
103+
col("ids_col").list.sum().alias("ids_col_sum"),
104+
col("ids_col").list.mean().alias("ids_col_mean"),
105+
col("ids_col").list.min().alias("ids_col_min"),
106+
col("ids_col").list.max().alias("ids_col_max"),
107+
).sort("group_col", desc=False)
108+
result_dict = result.to_pydict()
109+
assert result_dict == expected

0 commit comments

Comments (0)