@@ -27,6 +27,7 @@
 from typing import Dict, List, Optional, Union
 
 import xxhash
+from filelock import FileLock
 
 from . import utils
 from .arrow_dataset import Dataset
@@ -391,101 +392,104 @@ def download_and_prepare( |
                 dataset_name=self.name, download_config=download_config, data_dir=self.config.data_dir
             )
 
-        data_exists = os.path.exists(self._cache_dir)
-        if data_exists and download_mode == REUSE_DATASET_IF_EXISTS:
-            logger.info("Reusing dataset %s (%s)", self.name, self._cache_dir)
-            self.download_post_processing_resources(dl_manager)
-            return
-
-        # Currently it's not possible to overwrite the data because it would
-        # conflict with versioning: If the last version has already been generated,
-        # it will always be reloaded and cache_dir will be set at construction.
-        if data_exists and download_mode != REUSE_CACHE_IF_EXISTS:
-            raise ValueError(
-                "Trying to overwrite an existing dataset {} at {}. A dataset with "
-                "the same version {} already exists. If the dataset has changed, "
-                "please update the version number.".format(self.name, self._cache_dir, self.config.version)
-            )
+        # Prevent parallel disk operations
+        lock_path = os.path.join(self._cache_dir_root, self._cache_dir.replace("/", "_") + ".lock")
+        with FileLock(lock_path):
+            data_exists = os.path.exists(self._cache_dir)
+            if data_exists and download_mode == REUSE_DATASET_IF_EXISTS:
+                logger.info("Reusing dataset %s (%s)", self.name, self._cache_dir)
+                self.download_post_processing_resources(dl_manager)
+                return
+
+            # Currently it's not possible to overwrite the data because it would
+            # conflict with versioning: If the last version has already been generated,
+            # it will always be reloaded and cache_dir will be set at construction.
+            if data_exists and download_mode != REUSE_CACHE_IF_EXISTS:
+                raise ValueError(
+                    "Trying to overwrite an existing dataset {} at {}. A dataset with "
+                    "the same version {} already exists. If the dataset has changed, "
+                    "please update the version number.".format(self.name, self._cache_dir, self.config.version)
+                )
 
-        logger.info("Generating dataset %s (%s)", self.name, self._cache_dir)
-        if not is_remote_url(self._cache_dir):  # if cache dir is local, check for available space
-            os.makedirs(self._cache_dir_root, exist_ok=True)
-            if not utils.has_sufficient_disk_space(self.info.size_in_bytes or 0, directory=self._cache_dir_root):
-                raise IOError(
-                    "Not enough disk space. Needed: {} (download: {}, generated: {}, post-processed: {})".format(
-                        utils.size_str(self.info.size_in_bytes or 0),
-                        utils.size_str(self.info.download_size or 0),
-                        utils.size_str(self.info.dataset_size or 0),
-                        utils.size_str(self.info.post_processing_size or 0),
+            logger.info("Generating dataset %s (%s)", self.name, self._cache_dir)
+            if not is_remote_url(self._cache_dir):  # if cache dir is local, check for available space
+                os.makedirs(self._cache_dir_root, exist_ok=True)
+                if not utils.has_sufficient_disk_space(self.info.size_in_bytes or 0, directory=self._cache_dir_root):
+                    raise IOError(
+                        "Not enough disk space. Needed: {} (download: {}, generated: {}, post-processed: {})".format(
+                            utils.size_str(self.info.size_in_bytes or 0),
+                            utils.size_str(self.info.download_size or 0),
+                            utils.size_str(self.info.dataset_size or 0),
+                            utils.size_str(self.info.post_processing_size or 0),
+                        )
                     )
+
+            @contextlib.contextmanager
+            def incomplete_dir(dirname):
+                """Create temporary dir for dirname and rename on exit."""
+                if is_remote_url(dirname):
+                    yield dirname
+                else:
+                    tmp_dir = dirname + ".incomplete"
+                    os.makedirs(tmp_dir)
+                    try:
+                        yield tmp_dir
+                        if os.path.isdir(dirname):
+                            shutil.rmtree(dirname)
+                        os.rename(tmp_dir, dirname)
+                    finally:
+                        if os.path.exists(tmp_dir):
+                            shutil.rmtree(tmp_dir)
+
+            # Print is intentional: we want this to always go to stdout so user has
+            # information needed to cancel download/preparation if needed.
+            # This comes right before the progress bar.
+            print(
+                f"Downloading and preparing dataset {self.info.builder_name}/{self.info.config_name} "
+                f"(download: {utils.size_str(self.info.download_size)}, generated: {utils.size_str(self.info.dataset_size)}, "
+                f"post-processed: {utils.size_str(self.info.post_processing_size)}, "
+                f"total: {utils.size_str(self.info.size_in_bytes)}) to {self._cache_dir}..."
+            )
+
+            if self.manual_download_instructions is not None:
+                assert (
+                    dl_manager.manual_dir is not None
+                ), "The dataset {} with config {} requires manual data. \n Please follow the manual download instructions: {}. \n Manual data can be loaded with `nlp.load_dataset({}, data_dir='<path/to/manual/data>')".format(
+                    self.name, self.config.name, self.manual_download_instructions, self.name
                 )
 
-        @contextlib.contextmanager
-        def incomplete_dir(dirname):
-            """Create temporary dir for dirname and rename on exit."""
-            if is_remote_url(dirname):
-                yield dirname
-            else:
-                tmp_dir = dirname + ".incomplete"
-                os.makedirs(tmp_dir)
-                try:
-                    yield tmp_dir
-                    if os.path.isdir(dirname):
-                        shutil.rmtree(dirname)
-                    os.rename(tmp_dir, dirname)
-                finally:
-                    if os.path.exists(tmp_dir):
-                        shutil.rmtree(tmp_dir)
-
-        # Print is intentional: we want this to always go to stdout so user has
-        # information needed to cancel download/preparation if needed.
-        # This comes right before the progress bar.
-        print(
-            f"Downloading and preparing dataset {self.info.builder_name}/{self.info.config_name} "
-            f"(download: {utils.size_str(self.info.download_size)}, generated: {utils.size_str(self.info.dataset_size)}, "
-            f"post-processed: {utils.size_str(self.info.post_processing_size)}, "
-            f"total: {utils.size_str(self.info.size_in_bytes)}) to {self._cache_dir}..."
-        )
+            # Create a tmp dir and rename to self._cache_dir on successful exit.
+            with incomplete_dir(self._cache_dir) as tmp_data_dir:
+                # Temporarily assign _cache_dir to tmp_data_dir to avoid having to forward
+                # it to every sub function.
+                with utils.temporary_assignment(self, "_cache_dir", tmp_data_dir):
+                    # Try to download the already prepared dataset files
+                    downloaded_from_gcs = False
+                    if try_from_hf_gcs:
+                        try:
+                            self._download_prepared_from_hf_gcs()
+                            downloaded_from_gcs = True
+                        except (DatasetNotOnHfGcs, MissingFilesOnHfGcs):
+                            logger.info("Dataset not on Hf google storage. Downloading and preparing it from source")
+                    if not downloaded_from_gcs:
+                        self._download_and_prepare(
+                            dl_manager=dl_manager, verify_infos=verify_infos, **download_and_prepare_kwargs
+                        )
+                    # Sync info
+                    self.info.dataset_size = sum(split.num_bytes for split in self.info.splits.values())
+                    self.info.download_checksums = dl_manager.get_recorded_sizes_checksums()
+                    self.info.size_in_bytes = self.info.dataset_size + self.info.download_size
+                    # Save info
+                    self._save_info()
+
+            # Download post processing resources
+            self.download_post_processing_resources(dl_manager)
 
-        if self.manual_download_instructions is not None:
-            assert (
-                dl_manager.manual_dir is not None
-            ), "The dataset {} with config {} requires manual data. \n Please follow the manual download instructions: {}. \n Manual data can be loaded with `nlp.load_dataset({}, data_dir='<path/to/manual/data>')".format(
-                self.name, self.config.name, self.manual_download_instructions, self.name
+            print(
+                f"Dataset {self.name} downloaded and prepared to {self._cache_dir}. "
+                f"Subsequent calls will reuse this data."
             )
 
-        # Create a tmp dir and rename to self._cache_dir on successful exit.
-        with incomplete_dir(self._cache_dir) as tmp_data_dir:
-            # Temporarily assign _cache_dir to tmp_data_dir to avoid having to forward
-            # it to every sub function.
-            with utils.temporary_assignment(self, "_cache_dir", tmp_data_dir):
-                # Try to download the already prepared dataset files
-                downloaded_from_gcs = False
-                if try_from_hf_gcs:
-                    try:
-                        self._download_prepared_from_hf_gcs()
-                        downloaded_from_gcs = True
-                    except (DatasetNotOnHfGcs, MissingFilesOnHfGcs):
-                        logger.info("Dataset not on Hf google storage. Downloading and preparing it from source")
-                if not downloaded_from_gcs:
-                    self._download_and_prepare(
-                        dl_manager=dl_manager, verify_infos=verify_infos, **download_and_prepare_kwargs
-                    )
-                # Sync info
-                self.info.dataset_size = sum(split.num_bytes for split in self.info.splits.values())
-                self.info.download_checksums = dl_manager.get_recorded_sizes_checksums()
-                self.info.size_in_bytes = self.info.dataset_size + self.info.download_size
-                # Save info
-                self._save_info()
-
-        # Download post processing resources
-        self.download_post_processing_resources(dl_manager)
-
-        print(
-            f"Dataset {self.name} downloaded and prepared to {self._cache_dir}. "
-            f"Subsequent calls will reuse this data."
-        )
-
     def _download_prepared_from_hf_gcs(self):
         relative_data_dir = self._relative_data_dir(with_version=True, with_hash=False)
         reader = ArrowReader(self._cache_dir, self.info)
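
Note on the locking pattern introduced above: `filelock.FileLock` serializes processes on an on-disk lock file, so concurrent calls to `download_and_prepare` no longer race between the "does the cache exist?" check and the generation step. A minimal standalone sketch of the same idea (the `prepare_cache` function and `cache_dir` argument are hypothetical names for illustration, not part of the library):

import os
from filelock import FileLock

def prepare_cache(cache_dir: str) -> None:
    # One lock file per cache dir; FileLock blocks until the current holder releases it.
    lock_path = cache_dir.rstrip(os.sep) + ".lock"
    with FileLock(lock_path):
        if os.path.exists(cache_dir):
            return  # another process finished first; reuse its output
        os.makedirs(cache_dir)
        # ... generate dataset files inside cache_dir ...

The existence check must happen inside the lock: checking first and locking afterwards would reintroduce the check-then-act race this commit fixes.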
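
The `incomplete_dir` helper (unchanged here apart from re-indentation under the lock) is the usual write-to-temp-then-rename pattern: data is generated in `<cache_dir>.incomplete` and only renamed to `<cache_dir>` if generation succeeds, so an interrupted run never leaves a half-written cache that a later call would mistake for a complete one. Extracted as a self-contained sketch:

import contextlib
import os
import shutil

@contextlib.contextmanager
def incomplete_dir(dirname):
    tmp_dir = dirname + ".incomplete"
    os.makedirs(tmp_dir)
    try:
        yield tmp_dir
        if os.path.isdir(dirname):
            shutil.rmtree(dirname)  # replace a stale previous version
        os.rename(tmp_dir, dirname)  # atomic on POSIX within one filesystem
    finally:
        if os.path.exists(tmp_dir):  # body raised before the rename
            shutil.rmtree(tmp_dir)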
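
`utils.temporary_assignment` is used in the diff but not shown; presumably it is a context manager that swaps an attribute and restores it on exit, which lets `_cache_dir` point at the temporary directory without threading it through every sub-function. A plausible implementation, offered as an assumption rather than the actual `nlp.utils` code:

import contextlib

@contextlib.contextmanager
def temporary_assignment(obj, attr, value):
    # Hypothetical sketch: the real helper in nlp.utils may differ.
    original = getattr(obj, attr)
    setattr(obj, attr, value)
    try:
        yield
    finally:
        setattr(obj, attr, original)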