Skip to content

Commit 13f18e3

Browse files
Support ndjson data files (#7154)
* Test .ndjson files
* Support .ndjson files
* Add comment about format no longer being maintained
1 parent d70c902 commit 13f18e3

File tree

2 files changed

+19
-0
lines changed

2 files changed

+19
-0
lines changed

src/datasets/packaged_modules/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,8 @@ def _hash_python_lines(lines: List[str]) -> str:
6161
".tsv": ("csv", {"sep": "\t"}),
6262
".json": ("json", {}),
6363
".jsonl": ("json", {}),
64+
# ndjson is no longer maintained (see: https://github.com/ndjson/ndjson-spec/issues/35#issuecomment-1285673417)
65+
".ndjson": ("json", {}),
6466
".parquet": ("parquet", {}),
6567
".geoparquet": ("parquet", {}),
6668
".gpq": ("parquet", {}),

tests/packaged_modules/test_json.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,22 @@ def jsonl_file(tmp_path):
2424
return str(filename)
2525

2626

27+
# ndjson format is no longer maintained (see: https://github.com/ndjson/ndjson-spec/issues/35#issuecomment-1285673417)
28+
@pytest.fixture
29+
def ndjson_file(tmp_path):
30+
filename = tmp_path / "file.ndjson"
31+
data = textwrap.dedent(
32+
"""\
33+
{"col_1": -1}
34+
{"col_1": 1, "col_2": 2}
35+
{"col_1": 10, "col_2": 20}
36+
"""
37+
)
38+
with open(filename, "w") as f:
39+
f.write(data)
40+
return str(filename)
41+
42+
2743
@pytest.fixture
2844
def jsonl_file_utf16_encoded(tmp_path):
2945
filename = tmp_path / "file_utf16_encoded.jsonl"
@@ -188,6 +204,7 @@ def test_config_raises_when_invalid_data_files(data_files) -> None:
188204
"file_fixture, config_kwargs",
189205
[
190206
("jsonl_file", {}),
207+
("ndjson_file", {}),
191208
("jsonl_file_utf16_encoded", {"encoding": "utf-16"}),
192209
("json_file_with_list_of_dicts", {}),
193210
("json_file_with_list_of_dicts_field", {"field": "field3"}),

0 commit comments

Comments
 (0)