Skip to content

Download is broken for dict of dicts: FileNotFoundError #6869

@albertvillanova

Description

@albertvillanova

There seems to be a bug when downloading a dict of dicts of URLs. It was introduced by a linked change (the reference is not preserved in this copy).

Steps to reproduce the bug:

from datasets import DownloadManager

dl_manager = DownloadManager()
paths = dl_manager.download({"train": {"frr": "hf://datasets/wikimedia/wikipedia/20231101.frr/train-00000-of-00001.parquet"}})

Stack trace:

---------------------------------------------------------------------------
FileNotFoundError                         Traceback (most recent call last)
<ipython-input-7-0e0d76d25b09> in <module>
----> 1 paths = dl_manager.download({"train": {"frr": "hf://datasets/wikimedia/wikipedia/20231101.frr/train-00000-of-00001.parquet"}})

.../huggingface/datasets/src/datasets/download/download_manager.py in download(self, url_or_urls)
    255         start_time = datetime.now()
    256         with stack_multiprocessing_download_progress_bars():
--> 257             downloaded_path_or_paths = map_nested(
    258                 download_func,
    259                 url_or_urls,

.../huggingface/datasets/src/datasets/utils/py_utils.py in map_nested(function, data_struct, dict_only, map_list, map_tuple, map_numpy, num_proc, parallel_min_length, batched, batch_size, types, disable_tqdm, desc)
    506                 batch_size = max(len(iterable) // num_proc + int(len(iterable) % num_proc > 0), 1)
    507             iterable = list(iter_batched(iterable, batch_size))
--> 508         mapped = [
    509             _single_map_nested((function, obj, batched, batch_size, types, None, True, None))
    510             for obj in hf_tqdm(iterable, disable=disable_tqdm, desc=desc)

.../huggingface/datasets/src/datasets/utils/py_utils.py in <listcomp>(.0)
    507             iterable = list(iter_batched(iterable, batch_size))
    508         mapped = [
--> 509             _single_map_nested((function, obj, batched, batch_size, types, None, True, None))
    510             for obj in hf_tqdm(iterable, disable=disable_tqdm, desc=desc)
    511         ]

.../huggingface/datasets/src/datasets/utils/py_utils.py in _single_map_nested(args)
    375         and all(not isinstance(v, types) for v in data_struct)
    376     ):
--> 377         return [mapped_item for batch in iter_batched(data_struct, batch_size) for mapped_item in function(batch)]
    378 
    379     # Reduce logging to keep things readable in multiprocessing with tqdm

.../huggingface/datasets/src/datasets/utils/py_utils.py in <listcomp>(.0)
    375         and all(not isinstance(v, types) for v in data_struct)
    376     ):
--> 377         return [mapped_item for batch in iter_batched(data_struct, batch_size) for mapped_item in function(batch)]
    378 
    379     # Reduce logging to keep things readable in multiprocessing with tqdm

.../huggingface/datasets/src/datasets/download/download_manager.py in _download_batched(self, url_or_filenames, download_config)
    311             )
    312         else:
--> 313             return [
    314                 self._download_single(url_or_filename, download_config=download_config)
    315                 for url_or_filename in url_or_filenames

.../huggingface/datasets/src/datasets/download/download_manager.py in <listcomp>(.0)
    312         else:
    313             return [
--> 314                 self._download_single(url_or_filename, download_config=download_config)
    315                 for url_or_filename in url_or_filenames
    316             ]

.../huggingface/datasets/src/datasets/download/download_manager.py in _download_single(self, url_or_filename, download_config)
    321             # append the relative path to the base_path
    322             url_or_filename = url_or_path_join(self._base_path, url_or_filename)
--> 323         out = cached_path(url_or_filename, download_config=download_config)
    324         out = tracked_str(out)
    325         out.set_origin(url_or_filename)

.../huggingface/datasets/src/datasets/utils/file_utils.py in cached_path(url_or_filename, download_config, **download_kwargs)
    220     elif is_local_path(url_or_filename):
    221         # File, but it doesn't exist.
--> 222         raise FileNotFoundError(f"Local file {url_or_filename} doesn't exist")
    223     else:
    224         # Something unknown

FileNotFoundError: Local file .../huggingface/datasets/{'frr': 'hf:/datasets/wikimedia/wikipedia/20231101.frr/train-00000-of-00001.parquet'} doesn't exist

Related to:

Metadata

Labels

bug: Something isn't working

Type

No type

Projects

No projects

Milestone

No milestone

Relationships

None yet

Development

No branches or pull requests

Issue actions