
Commit 7ce446d

[Distributed] Fix load_dataset error when multiprocessing + add test (#544)
* Fix #543 + add test
* fix tests
* make csv/json/pandas/text able to access distant data
1 parent 8cf7dab commit 7ce446d
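The failure this commit targets (#543) shows up when several processes call `load_dataset` on the same files at the same time, as happens in distributed training. Below is a minimal sketch of that usage pattern, assuming the `nlp` package is installed; the CSV path and the worker count are placeholders for illustration, not part of this commit:

# Sketch: several worker processes load the same CSV dataset concurrently.
# With the FileLock added to builder.py below, one process prepares the
# cache while the others wait on the lock and then reuse the prepared data.
from multiprocessing import Pool

from nlp import load_dataset


def load_in_worker(_):
    # "my_train.csv" is a hypothetical local file used only for illustration.
    return load_dataset("./datasets/csv", data_files={"train": "my_train.csv"})


if __name__ == "__main__":
    with Pool(processes=4) as pool:
        pool.map(load_in_worker, range(4))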

File tree

6 files changed: +149 −114 lines

datasets/csv/csv.py
datasets/json/json.py
datasets/pandas/pandas.py
datasets/text/text.py
src/nlp/builder.py
tests/test_dataset_common.py

datasets/csv/csv.py

Lines changed: 5 additions & 4 deletions
@@ -56,15 +56,16 @@ def _info(self):
     def _split_generators(self, dl_manager):
         """ We handle string, list and dicts in datafiles
         """
-        if isinstance(self.config.data_files, (str, list, tuple)):
-            files = self.config.data_files
+        data_files = dl_manager.download_and_extract(self.config.data_files)
+        if isinstance(data_files, (str, list, tuple)):
+            files = data_files
             if isinstance(files, str):
                 files = [files]
             return [nlp.SplitGenerator(name=nlp.Split.TRAIN, gen_kwargs={"files": files})]
         splits = []
         for split_name in [nlp.Split.TRAIN, nlp.Split.VALIDATION, nlp.Split.TEST]:
-            if split_name in self.config.data_files:
-                files = self.config.data_files[split_name]
+            if split_name in data_files:
+                files = data_files[split_name]
                 if isinstance(files, str):
                     files = [files]
                 splits.append(nlp.SplitGenerator(name=split_name, gen_kwargs={"files": files}))
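Because `data_files` is now routed through `dl_manager.download_and_extract`, the csv loader (and the json, pandas and text loaders changed the same way below) can take remote URLs as well as local paths, which is the "access distant data" part of the commit message. A hedged usage sketch with a made-up URL:

# Sketch only: the URL is a placeholder. download_and_extract downloads and
# caches the remote file locally before the CSV loader reads it; plain local
# paths are passed through unchanged.
from nlp import load_dataset

dataset = load_dataset(
    "./datasets/csv",
    data_files={"train": "https://example.com/data/train.csv"},
)
print(dataset["train"])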

datasets/json/json.py

Lines changed: 5 additions & 4 deletions
@@ -43,15 +43,16 @@ def _info(self):
     def _split_generators(self, dl_manager):
         """ We handle string, list and dicts in datafiles
         """
-        if isinstance(self.config.data_files, (str, list, tuple)):
-            files = self.config.data_files
+        data_files = dl_manager.download_and_extract(self.config.data_files)
+        if isinstance(data_files, (str, list, tuple)):
+            files = data_files
             if isinstance(files, str):
                 files = [files]
             return [nlp.SplitGenerator(name=nlp.Split.TRAIN, gen_kwargs={"files": files})]
         splits = []
         for split_name in [nlp.Split.TRAIN, nlp.Split.VALIDATION, nlp.Split.TEST]:
-            if split_name in self.config.data_files:
-                files = self.config.data_files[split_name]
+            if split_name in data_files:
+                files = data_files[split_name]
                 if isinstance(files, str):
                     files = [files]
                 splits.append(nlp.SplitGenerator(name=split_name, gen_kwargs={"files": files}))

datasets/pandas/pandas.py

Lines changed: 5 additions & 4 deletions
@@ -13,15 +13,16 @@ def _info(self):
     def _split_generators(self, dl_manager):
         """ We handle string, list and dicts in datafiles
         """
-        if isinstance(self.config.data_files, (str, list, tuple)):
-            files = self.config.data_files
+        data_files = dl_manager.download_and_extract(self.config.data_files)
+        if isinstance(data_files, (str, list, tuple)):
+            files = data_files
             if isinstance(files, str):
                 files = [files]
             return [nlp.SplitGenerator(name=nlp.Split.TRAIN, gen_kwargs={"files": files})]
         splits = []
         for split_name in [nlp.Split.TRAIN, nlp.Split.VALIDATION, nlp.Split.TEST]:
-            if split_name in self.config.data_files:
-                files = self.config.data_files[split_name]
+            if split_name in data_files:
+                files = data_files[split_name]
                 if isinstance(files, str):
                     files = [files]
                 splits.append(nlp.SplitGenerator(name=split_name, gen_kwargs={"files": files}))

datasets/text/text.py

Lines changed: 11 additions & 13 deletions
@@ -11,22 +11,20 @@ def _split_generators(self, dl_manager):
         If str or List[str], then the dataset returns only the 'train' split.
         If dict, then keys should be from the `nlp.Split` enum.
         """
-        if isinstance(self.config.data_files, (str, list, tuple)):
-            # Handle case with only one split
-            files = self.config.data_files
+        data_files = dl_manager.download_and_extract(self.config.data_files)
+        if isinstance(data_files, (str, list, tuple)):
+            files = data_files
             if isinstance(files, str):
                 files = [files]
             return [nlp.SplitGenerator(name=nlp.Split.TRAIN, gen_kwargs={"files": files})]
-        else:
-            # Handle case with several splits and a dict mapping
-            splits = []
-            for split_name in [nlp.Split.TRAIN, nlp.Split.VALIDATION, nlp.Split.TEST]:
-                if split_name in self.config.data_files:
-                    files = self.config.data_files[split_name]
-                    if isinstance(files, str):
-                        files = [files]
-                    splits.append(nlp.SplitGenerator(name=split_name, gen_kwargs={"files": files}))
-            return splits
+        splits = []
+        for split_name in [nlp.Split.TRAIN, nlp.Split.VALIDATION, nlp.Split.TEST]:
+            if split_name in data_files:
+                files = data_files[split_name]
+                if isinstance(files, str):
+                    files = [files]
+                splits.append(nlp.SplitGenerator(name=split_name, gen_kwargs={"files": files}))
+        return splits
 
     def _generate_examples(self, files):
         """ Read files sequentially, then lines sequentially. """

src/nlp/builder.py

Lines changed: 92 additions & 88 deletions
@@ -27,6 +27,7 @@
 from typing import Dict, List, Optional, Union
 
 import xxhash
+from filelock import FileLock
 
 from . import utils
 from .arrow_dataset import Dataset
@@ -391,101 +392,104 @@ def download_and_prepare(
                 dataset_name=self.name, download_config=download_config, data_dir=self.config.data_dir
             )
 
-        data_exists = os.path.exists(self._cache_dir)
-        if data_exists and download_mode == REUSE_DATASET_IF_EXISTS:
-            logger.info("Reusing dataset %s (%s)", self.name, self._cache_dir)
-            self.download_post_processing_resources(dl_manager)
-            return
-
-        # Currently it's not possible to overwrite the data because it would
-        # conflict with versioning: If the last version has already been generated,
-        # it will always be reloaded and cache_dir will be set at construction.
-        if data_exists and download_mode != REUSE_CACHE_IF_EXISTS:
-            raise ValueError(
-                "Trying to overwrite an existing dataset {} at {}. A dataset with "
-                "the same version {} already exists. If the dataset has changed, "
-                "please update the version number.".format(self.name, self._cache_dir, self.config.version)
-            )
+        # Prevent parallel disk operations
+        lock_path = os.path.join(self._cache_dir_root, self._cache_dir.replace("/", "_") + ".lock")
+        with FileLock(lock_path):
+            data_exists = os.path.exists(self._cache_dir)
+            if data_exists and download_mode == REUSE_DATASET_IF_EXISTS:
+                logger.info("Reusing dataset %s (%s)", self.name, self._cache_dir)
+                self.download_post_processing_resources(dl_manager)
+                return
+
+            # Currently it's not possible to overwrite the data because it would
+            # conflict with versioning: If the last version has already been generated,
+            # it will always be reloaded and cache_dir will be set at construction.
+            if data_exists and download_mode != REUSE_CACHE_IF_EXISTS:
+                raise ValueError(
+                    "Trying to overwrite an existing dataset {} at {}. A dataset with "
+                    "the same version {} already exists. If the dataset has changed, "
+                    "please update the version number.".format(self.name, self._cache_dir, self.config.version)
+                )
 
-        logger.info("Generating dataset %s (%s)", self.name, self._cache_dir)
-        if not is_remote_url(self._cache_dir):  # if cache dir is local, check for available space
-            os.makedirs(self._cache_dir_root, exist_ok=True)
-            if not utils.has_sufficient_disk_space(self.info.size_in_bytes or 0, directory=self._cache_dir_root):
-                raise IOError(
-                    "Not enough disk space. Needed: {} (download: {}, generated: {}, post-processed: {})".format(
-                        utils.size_str(self.info.size_in_bytes or 0),
-                        utils.size_str(self.info.download_size or 0),
-                        utils.size_str(self.info.dataset_size or 0),
-                        utils.size_str(self.info.post_processing_size or 0),
+            logger.info("Generating dataset %s (%s)", self.name, self._cache_dir)
+            if not is_remote_url(self._cache_dir):  # if cache dir is local, check for available space
+                os.makedirs(self._cache_dir_root, exist_ok=True)
+                if not utils.has_sufficient_disk_space(self.info.size_in_bytes or 0, directory=self._cache_dir_root):
+                    raise IOError(
+                        "Not enough disk space. Needed: {} (download: {}, generated: {}, post-processed: {})".format(
+                            utils.size_str(self.info.size_in_bytes or 0),
+                            utils.size_str(self.info.download_size or 0),
+                            utils.size_str(self.info.dataset_size or 0),
+                            utils.size_str(self.info.post_processing_size or 0),
+                        )
                     )
+
+            @contextlib.contextmanager
+            def incomplete_dir(dirname):
+                """Create temporary dir for dirname and rename on exit."""
+                if is_remote_url(dirname):
+                    yield dirname
+                else:
+                    tmp_dir = dirname + ".incomplete"
+                    os.makedirs(tmp_dir)
+                    try:
+                        yield tmp_dir
+                        if os.path.isdir(dirname):
+                            shutil.rmtree(dirname)
+                        os.rename(tmp_dir, dirname)
+                    finally:
+                        if os.path.exists(tmp_dir):
+                            shutil.rmtree(tmp_dir)
+
+            # Print is intentional: we want this to always go to stdout so user has
+            # information needed to cancel download/preparation if needed.
+            # This comes right before the progress bar.
+            print(
+                f"Downloading and preparing dataset {self.info.builder_name}/{self.info.config_name} "
+                f"(download: {utils.size_str(self.info.download_size)}, generated: {utils.size_str(self.info.dataset_size)}, "
+                f"post-processed: {utils.size_str(self.info.post_processing_size)}, "
+                f"total: {utils.size_str(self.info.size_in_bytes)}) to {self._cache_dir}..."
+            )
+
+            if self.manual_download_instructions is not None:
+                assert (
+                    dl_manager.manual_dir is not None
+                ), "The dataset {} with config {} requires manual data. \n Please follow the manual download instructions: {}. \n Manual data can be loaded with `nlp.load_dataset({}, data_dir='<path/to/manual/data>')".format(
+                    self.name, self.config.name, self.manual_download_instructions, self.name
                 )
 
-        @contextlib.contextmanager
-        def incomplete_dir(dirname):
-            """Create temporary dir for dirname and rename on exit."""
-            if is_remote_url(dirname):
-                yield dirname
-            else:
-                tmp_dir = dirname + ".incomplete"
-                os.makedirs(tmp_dir)
-                try:
-                    yield tmp_dir
-                    if os.path.isdir(dirname):
-                        shutil.rmtree(dirname)
-                    os.rename(tmp_dir, dirname)
-                finally:
-                    if os.path.exists(tmp_dir):
-                        shutil.rmtree(tmp_dir)
-
-        # Print is intentional: we want this to always go to stdout so user has
-        # information needed to cancel download/preparation if needed.
-        # This comes right before the progress bar.
-        print(
-            f"Downloading and preparing dataset {self.info.builder_name}/{self.info.config_name} "
-            f"(download: {utils.size_str(self.info.download_size)}, generated: {utils.size_str(self.info.dataset_size)}, "
-            f"post-processed: {utils.size_str(self.info.post_processing_size)}, "
-            f"total: {utils.size_str(self.info.size_in_bytes)}) to {self._cache_dir}..."
-        )
+            # Create a tmp dir and rename to self._cache_dir on successful exit.
+            with incomplete_dir(self._cache_dir) as tmp_data_dir:
+                # Temporarily assign _cache_dir to tmp_data_dir to avoid having to forward
+                # it to every sub function.
+                with utils.temporary_assignment(self, "_cache_dir", tmp_data_dir):
+                    # Try to download the already prepared dataset files
+                    downloaded_from_gcs = False
+                    if try_from_hf_gcs:
+                        try:
+                            self._download_prepared_from_hf_gcs()
+                            downloaded_from_gcs = True
+                        except (DatasetNotOnHfGcs, MissingFilesOnHfGcs):
+                            logger.info("Dataset not on Hf google storage. Downloading and preparing it from source")
+                    if not downloaded_from_gcs:
+                        self._download_and_prepare(
+                            dl_manager=dl_manager, verify_infos=verify_infos, **download_and_prepare_kwargs
+                        )
+                    # Sync info
+                    self.info.dataset_size = sum(split.num_bytes for split in self.info.splits.values())
+                    self.info.download_checksums = dl_manager.get_recorded_sizes_checksums()
+                    self.info.size_in_bytes = self.info.dataset_size + self.info.download_size
+                    # Save info
+                    self._save_info()
+
+            # Download post processing resources
+            self.download_post_processing_resources(dl_manager)
 
-        if self.manual_download_instructions is not None:
-            assert (
-                dl_manager.manual_dir is not None
-            ), "The dataset {} with config {} requires manual data. \n Please follow the manual download instructions: {}. \n Manual data can be loaded with `nlp.load_dataset({}, data_dir='<path/to/manual/data>')".format(
-                self.name, self.config.name, self.manual_download_instructions, self.name
+            print(
+                f"Dataset {self.name} downloaded and prepared to {self._cache_dir}. "
+                f"Subsequent calls will reuse this data."
             )
 
-        # Create a tmp dir and rename to self._cache_dir on successful exit.
-        with incomplete_dir(self._cache_dir) as tmp_data_dir:
-            # Temporarily assign _cache_dir to tmp_data_dir to avoid having to forward
-            # it to every sub function.
-            with utils.temporary_assignment(self, "_cache_dir", tmp_data_dir):
-                # Try to download the already prepared dataset files
-                downloaded_from_gcs = False
-                if try_from_hf_gcs:
-                    try:
-                        self._download_prepared_from_hf_gcs()
-                        downloaded_from_gcs = True
-                    except (DatasetNotOnHfGcs, MissingFilesOnHfGcs):
-                        logger.info("Dataset not on Hf google storage. Downloading and preparing it from source")
-                if not downloaded_from_gcs:
-                    self._download_and_prepare(
-                        dl_manager=dl_manager, verify_infos=verify_infos, **download_and_prepare_kwargs
-                    )
-                # Sync info
-                self.info.dataset_size = sum(split.num_bytes for split in self.info.splits.values())
-                self.info.download_checksums = dl_manager.get_recorded_sizes_checksums()
-                self.info.size_in_bytes = self.info.dataset_size + self.info.download_size
-                # Save info
-                self._save_info()
-
-        # Download post processing resources
-        self.download_post_processing_resources(dl_manager)
-
-        print(
-            f"Dataset {self.name} downloaded and prepared to {self._cache_dir}. "
-            f"Subsequent calls will reuse this data."
-        )
-
     def _download_prepared_from_hf_gcs(self):
         relative_data_dir = self._relative_data_dir(with_version=True, with_hash=False)
         reader = ArrowReader(self._cache_dir, self.info)
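The heart of the multiprocessing fix is serializing cache preparation with a `filelock.FileLock` keyed on the cache directory: the first process to take the lock builds the dataset, and every other process finds the data already prepared when its turn comes. A standalone sketch of that lock-then-check pattern follows, with illustrative names (`prepare_cache`, the lock path) that are not the library's API:

# Illustrative sketch of the lock-then-check pattern used above.
import os

from filelock import FileLock


def prepare_cache(cache_dir: str) -> None:
    """Build cache_dir exactly once even if many processes call this."""
    lock_path = cache_dir.rstrip("/") + ".lock"
    with FileLock(lock_path):
        if os.path.exists(cache_dir):
            # Another process already prepared the cache; just reuse it.
            return
        tmp_dir = cache_dir + ".incomplete"
        os.makedirs(tmp_dir)
        # ... write dataset files into tmp_dir ...
        os.rename(tmp_dir, cache_dir)  # publish atomically on success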

tests/test_dataset_common.py

Lines changed: 31 additions & 1 deletion
@@ -17,6 +17,8 @@
 import logging
 import os
 import tempfile
+from multiprocessing import Pool
+from unittest import TestCase
 
 import requests
 from absl.testing import parameterized
@@ -27,6 +29,7 @@
     DownloadConfig,
     GenerateMode,
     MockDownloadManager,
+    cached_path,
     hf_api,
     hf_bucket_url,
     import_main_class,
@@ -42,7 +45,7 @@
 
 class DatasetTester(object):
     def __init__(self, parent):
-        self.parent = parent
+        self.parent = parent if parent is not None else TestCase()
 
     def load_builder_class(self, dataset_name, is_local=False):
         # Download/copy dataset script
@@ -219,6 +222,33 @@ def test_load_real_dataset_all_configs(self, dataset_name):
                 self.assertTrue(len(dataset[split]) > 0)
 
 
+def distributed_load_dataset(args):
+    data_name, tmp_dir, datafiles = args
+    dataset = load_dataset(data_name, cache_dir=tmp_dir, data_files=datafiles)
+    return dataset
+
+
+class DistributedDatasetTest(TestCase):
+    def test_load_dataset_distributed(self):
+        num_workers = 5
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            data_name = "./datasets/csv"
+            data_base_path = os.path.join(data_name, "dummy/0.0.0/dummy_data.zip")
+            local_path = cached_path(
+                data_base_path, cache_dir=tmp_dir, extract_compressed_file=True, force_extract=True
+            )
+            datafiles = {
+                "train": os.path.join(local_path, "dummy_data/train.csv"),
+                "dev": os.path.join(local_path, "dummy_data/dev.csv"),
+                "test": os.path.join(local_path, "dummy_data/test.csv"),
+            }
+            args = data_name, tmp_dir, datafiles
+            with Pool(processes=num_workers) as pool:  # start num_workers processes
+                result = pool.apply_async(distributed_load_dataset, (args,))
+                _ = result.get(timeout=20)
+                _ = pool.map(distributed_load_dataset, [args] * num_workers)
+
+
 def get_aws_dataset_names():
     api = hf_api.HfApi()
     # fetch all dataset names
