Skip to content

Commit 2eb4edb

Browse files
Support JSON lines with missing struct fields (#7160)
* Test cast_array_to_features with struct with missing fields
* Support cast_array_to_features with struct with missing fields
1 parent 13f18e3 commit 2eb4edb

File tree

2 files changed

+14
-2
lines changed

2 files changed

+14
-2
lines changed

src/datasets/table.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2000,10 +2000,14 @@ def cast_array_to_feature(
20002000
sequence_kwargs = vars(feature).copy()
20012001
feature = sequence_kwargs.pop("feature")
20022002
feature = {name: Sequence(subfeature, **sequence_kwargs) for name, subfeature in feature.items()}
2003-
if isinstance(feature, dict) and {field.name for field in array.type} == set(feature):
2003+
if isinstance(feature, dict) and (array_fields := {field.name for field in array.type}) <= set(feature):
20042004
if array.type.num_fields == 0:
20052005
return array
2006-
arrays = [_c(array.field(name), subfeature) for name, subfeature in feature.items()]
2006+
null_array = pa.array([None] * len(array))
2007+
arrays = [
2008+
_c(array.field(name) if name in array_fields else null_array, subfeature)
2009+
for name, subfeature in feature.items()
2010+
]
20072011
return pa.StructArray.from_arrays(arrays, names=list(feature), mask=array.is_null())
20082012
elif pa.types.is_list(array.type) or pa.types.is_large_list(array.type):
20092013
# feature must be either [subfeature] or LargeList(subfeature) or Sequence(subfeature)

tests/test_table.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1142,6 +1142,14 @@ def test_cast_decimal_array_to_features():
11421142
cast_array_to_feature(arr, Sequence(Value("string")), allow_decimal_to_str=False)
11431143

11441144

1145+
def test_cast_array_to_features_with_struct_with_missing_fields():
1146+
arr = pa.array([{"age": 25}, {"age": 63}])
1147+
feature = {"age": Value("int32"), "name": Value("string")}
1148+
cast_array = cast_array_to_feature(arr, feature)
1149+
assert cast_array.type == pa.struct({"age": pa.int32(), "name": pa.string()})
1150+
assert cast_array.to_pylist() == [{"age": 25, "name": None}, {"age": 63, "name": None}]
1151+
1152+
11451153
def test_cast_array_to_features_nested():
11461154
arr = pa.array([[{"foo": [0]}]])
11471155
assert cast_array_to_feature(arr, [{"foo": Sequence(Value("string"))}]).type == pa.list_(

0 commit comments

Comments
 (0)