|
24 | 24 | import shutil |
25 | 25 | from hashlib import sha256 |
26 | 26 | from pathlib import Path |
27 | | -from typing import Any, Dict, List, Optional, Tuple, Union |
| 27 | +from typing import Dict, List, Optional, Tuple, Union |
28 | 28 | from urllib.parse import urlparse |
29 | 29 |
|
30 | | -import numpy as np |
31 | | -import pyarrow as pa |
32 | 30 | from filelock import FileLock |
33 | 31 |
|
34 | 32 | from .arrow_dataset import Dataset |
35 | 33 | from .builder import DatasetBuilder |
36 | 34 | from .dataset_dict import DatasetDict |
37 | 35 | from .features import Features |
38 | | -from .fingerprint import update_fingerprint |
39 | | -from .info import DATASET_INFOS_DICT_FILE_NAME, DatasetInfo |
| 36 | +from .info import DATASET_INFOS_DICT_FILE_NAME |
40 | 37 | from .metric import Metric |
41 | 38 | from .splits import Split |
42 | 39 | from .utils.download_manager import GenerateMode |
@@ -560,110 +557,3 @@ def load_dataset( |
560 | 557 | builder_instance._save_infos() |
561 | 558 |
|
562 | 559 | return ds |
563 | | - |
564 | | - |
565 | | -def concatenate_datasets( |
566 | | - dsets: List["Dataset"], |
567 | | - info: Optional[Any] = None, |
568 | | - split: Optional[Any] = None, |
569 | | -): |
570 | | - """ |
571 | | - Converts a list of :obj:``nlp.Dataset`` with the same schema into a single :obj:``nlp.Dataset``. |
572 | | -
|
573 | | - Args: |
574 | | - dsets (:obj:``List[nlp.Dataset]``): A list of Datasets to concatenate |
575 | | - info (:obj:``nlp.DatasetInfo``, `optional`, defaults to :obj:``None``): If specified, the dataset info containing info like |
576 | | - description, citation, etc. |
577 | | - split (:obj:``nlp.NamedSplit``, `optional`, defaults to :obj:``None``): If specified, the name of the dataset split. |
578 | | - """ |
579 | | - if not all([dset.features.type == dsets[0].features.type for dset in dsets]): |
580 | | - raise ValueError("Features must match for all datasets") |
581 | | - |
582 | | - # Datasets tables should all come from disk or memory, but not a mix |
583 | | - |
584 | | - dsets_in_memory = [not dset._data_files for dset in dsets] |
585 | | - if any(dset_in_memory != dsets_in_memory[0] for dset_in_memory in dsets_in_memory): |
586 | | - raise ValueError( |
587 | | - "Datasets should ALL come from memory, or should ALL come from disk.\n" |
588 | | - "However datasets {} come from memory and datasets {} come from disk.".format( |
589 | | - [i for i in range(len(dsets)) if dsets_in_memory[i]], |
590 | | - [i for i in range(len(dsets)) if not dsets_in_memory[i]], |
591 | | - ) |
592 | | - ) |
593 | | - |
594 | | - # Concatenate tables |
595 | | - |
596 | | - table = pa.concat_tables([dset._data for dset in dsets]) |
597 | | - data_files = [f for dset in dsets for f in dset._data_files] |
598 | | - inplace_history = [h for dset in dsets for h in dset._inplace_history] |
599 | | - |
600 | | - def apply_offset_to_indices_table(table, offset): |
601 | | - if offset == 0: |
602 | | - return table |
603 | | - else: |
604 | | - array = table["indices"] |
605 | | - if isinstance(array, pa.ChunkedArray): |
606 | | - new_array = pa.array(np.concatenate([c.to_numpy() for c in array.chunks]) + offset, pa.uint64()) |
607 | | - else: |
608 | | - new_array = pa.array(array.to_numpy() + offset, pa.uint64()) |
609 | | - return pa.Table.from_arrays([new_array], names=["indices"]) |
610 | | - |
611 | | - # Concatenate indices if they exist |
612 | | - |
613 | | - if any(dset._indices is not None for dset in dsets): |
614 | | - |
615 | | - # Datasets indices tables should all come from disk or memory, but not a mix |
616 | | - # Datasets with no indices tables are replaced with a dataset with an indices table in memory |
617 | | - |
618 | | - indices_mappings_in_memory = [not dset._indices_data_files for dset in dsets] |
619 | | - if any( |
620 | | - indices_mapping_in_memory != indices_mappings_in_memory[0] |
621 | | - for indices_mapping_in_memory in indices_mappings_in_memory |
622 | | - ): |
623 | | - raise ValueError( |
624 | | - "Datasets' indices should ALL come from memory, or should ALL come from disk.\n" |
625 | | - "However datasets' indices {} come from memory and datasets' indices {} come from disk.".format( |
626 | | - [i for i in range(len(dsets)) if indices_mappings_in_memory[i]], |
627 | | - [i for i in range(len(dsets)) if not indices_mappings_in_memory[i]], |
628 | | - ) |
629 | | - ) |
630 | | - indices_in_memory = indices_mappings_in_memory[0] |
631 | | - |
632 | | - # Create missing indices tables in memory |
633 | | - |
634 | | - if indices_in_memory: |
635 | | - for i in range(len(dsets)): |
636 | | - if dsets[i]._indices is None: |
637 | | - dsets[i] = dsets[i].select(range(len(dsets[i]))) |
638 | | - assert all(dset._indices is not None for dset in dsets), "each dataset should have an indices table" |
639 | | - |
640 | | - # An offset needs to be applied to the indices before concatenating |
641 | | - |
642 | | - indices_tables = [] |
643 | | - offset = 0 |
644 | | - for dset in dsets: |
645 | | - indices_tables.append(apply_offset_to_indices_table(dset._indices, offset)) |
646 | | - offset += len(dset._data) |
647 | | - |
648 | | - # Concatenate indices |
649 | | - |
650 | | - indices_table = pa.concat_tables(indices_tables) |
651 | | - indices_data_files = None if indices_in_memory else [f for dset in dsets for f in dset._indices_data_files] |
652 | | - else: |
653 | | - indices_table = None |
654 | | - indices_data_files = None |
655 | | - if info is None: |
656 | | - info = DatasetInfo.from_merge([dset.info for dset in dsets]) |
657 | | - fingerprint = update_fingerprint( |
658 | | - "".join(dset._fingerprint for dset in dsets), concatenate_datasets, {"info": info, "split": split} |
659 | | - ) |
660 | | - return Dataset( |
661 | | - table, |
662 | | - info=info, |
663 | | - split=split, |
664 | | - data_files=data_files, |
665 | | - indices_table=indices_table, |
666 | | - indices_data_files=indices_data_files, |
667 | | - fingerprint=fingerprint, |
668 | | - inplace_history=inplace_history, |
669 | | - ) |
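
For context on the block removed above: when some of the datasets carry an indices table (the lazy row mapping produced by operations such as `select` or `shuffle`), each table has to be shifted by the cumulative row count of the data tables that precede it before the indices can be concatenated. Below is a minimal standalone sketch of that offset step using plain pyarrow on toy tables; the names `shift_indices`, `first`, and `second` are illustrative and not part of the library.

```python
import numpy as np
import pyarrow as pa

def shift_indices(table: pa.Table, offset: int) -> pa.Table:
    # Mirror of apply_offset_to_indices_table: add `offset` to every value
    # in the "indices" column so the rows point into the right region of
    # the concatenated data table.
    if offset == 0:
        return table
    values = np.concatenate([chunk.to_numpy() for chunk in table["indices"].chunks])
    return pa.Table.from_arrays([pa.array(values + offset, pa.uint64())], names=["indices"])

# Toy indices tables standing in for dset._indices of two datasets whose
# underlying data tables have 3 and 2 rows respectively.
first = pa.Table.from_arrays([pa.array([0, 2, 1], pa.uint64())], names=["indices"])
second = pa.Table.from_arrays([pa.array([1, 0], pa.uint64())], names=["indices"])

merged = pa.concat_tables([shift_indices(first, 0), shift_indices(second, 3)])
print(merged["indices"].to_pylist())  # [0, 2, 1, 4, 3]
```

The second table is shifted by 3 because the first dataset's data table contributes 3 rows; after the shift, the concatenated mapping addresses the combined table without collisions.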