Skip to content

Commit acfc3ac

Browse files
authored
Better message when data files is empty (#586)
* better error message for empty data_files
* allow writer to write empty tables
* raise ValueError instead
1 parent 6b07c88 commit acfc3ac

File tree

6 files changed

+17
-2
lines changed

6 files changed

+17
-2
lines changed

datasets/csv/csv.py

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -56,6 +56,8 @@ def _info(self):
5656
def _split_generators(self, dl_manager):
5757
""" We handle string, list and dicts in datafiles
5858
"""
59+
if not self.config.data_files:
60+
raise ValueError(f"At least one data file must be specified, but got data_files={self.config.data_files}")
5961
data_files = dl_manager.download_and_extract(self.config.data_files)
6062
if isinstance(data_files, (str, list, tuple)):
6163
files = data_files

datasets/json/json.py

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -43,6 +43,8 @@ def _info(self):
4343
def _split_generators(self, dl_manager):
4444
""" We handle string, list and dicts in datafiles
4545
"""
46+
if not self.config.data_files:
47+
raise ValueError(f"At least one data file must be specified, but got data_files={self.config.data_files}")
4648
data_files = dl_manager.download_and_extract(self.config.data_files)
4749
if isinstance(data_files, (str, list, tuple)):
4850
files = data_files

datasets/pandas/pandas.py

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -13,6 +13,8 @@ def _info(self):
1313
def _split_generators(self, dl_manager):
1414
""" We handle string, list and dicts in datafiles
1515
"""
16+
if not self.config.data_files:
17+
raise ValueError(f"At least one data file must be specified, but got data_files={self.config.data_files}")
1618
data_files = dl_manager.download_and_extract(self.config.data_files)
1719
if isinstance(data_files, (str, list, tuple)):
1820
files = data_files

datasets/text/text.py

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -75,6 +75,8 @@ def _split_generators(self, dl_manager):
7575
If str or List[str], then the dataset returns only the 'train' split.
7676
If dict, then keys should be from the `nlp.Split` enum.
7777
"""
78+
if not self.config.data_files:
79+
raise ValueError(f"At least one data file must be specified, but got data_files={self.config.data_files}")
7880
data_files = dl_manager.download_and_extract(self.config.data_files)
7981
if isinstance(data_files, (str, list, tuple)):
8082
files = data_files

src/nlp/arrow_reader.py

Lines changed: 4 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -288,7 +288,8 @@ def _get_dataset_from_filename(self, filename_skip_take):
288288
mmap = pa.memory_map(filename)
289289
f = pa.ipc.open_stream(mmap)
290290
pa_table = f.read_all()
291-
if skip is not None and take is not None:
291+
# skip the slice when it would span the whole table (this includes an empty table): slicing an empty table may segfault
292+
if skip is not None and take is not None and not (skip == 0 and take == len(pa_table)):
292293
pa_table = pa_table.slice(skip, take)
293294
return pa_table
294295

@@ -317,7 +318,8 @@ def _get_dataset_from_filename(self, filename_skip_take):
317318
filename_skip_take["take"] if "take" in filename_skip_take else None,
318319
)
319320
pa_table = pa.parquet.read_table(filename, memory_map=True)
320-
if skip is not None and take is not None:
321+
# skip the slice when it would span the whole table (this includes an empty table): slicing an empty table may segfault
322+
if skip is not None and take is not None and not (skip == 0 and take == len(pa_table)):
321323
pa_table = pa_table.slice(skip, take)
322324
return pa_table
323325

src/nlp/arrow_writer.py

Lines changed: 5 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -288,6 +288,11 @@ def write_table(self, pa_table: pa.Table, writer_batch_size: Optional[int] = Non
288288

289289
def finalize(self, close_stream=True):
290290
self.write_on_file()
291+
if self.pa_writer is None:
292+
if self._schema is not None:
293+
self._build_writer(self._schema)
294+
else:
295+
raise ValueError("Please pass `features` or at least one example when writing data")
291296
self.pa_writer.close()
292297
if close_stream:
293298
self.stream.close()

0 commit comments

Comments
 (0)