15 changes: 8 additions & 7 deletions docs/source/about_dataset_features.mdx
@@ -32,20 +32,21 @@ Refer to [`Value`] for a full list of supported data types.

The [`ClassLabel`] feature informs 🤗 Datasets that the `label` column contains two classes. The classes are labeled `not_equivalent` and `equivalent`. Labels are stored as integers in the dataset. When you retrieve the labels, [`ClassLabel.int2str`] and [`ClassLabel.str2int`] carry out the conversion from integer value to label name, and vice versa.

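For instance, a minimal sketch of that round trip (the label names follow the example above):

```py
>>> from datasets import ClassLabel
>>> label = ClassLabel(names=['not_equivalent', 'equivalent'])
>>> label.str2int('equivalent')
1
>>> label.int2str(1)
'equivalent'
```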
If your data type contains a list of objects, then you want to use the [`Sequence`] feature. Remember the SQuAD dataset?
If your data type contains a list of objects, then you want to use the [`List`] feature. Remember the SQuAD dataset?

```py
>>> from datasets import load_dataset
>>> dataset = load_dataset('rajpurkar/squad', split='train')
>>> dataset.features
{'answers': Sequence(feature={'text': Value(dtype='string'), 'answer_start': Value(dtype='int32')}, length=-1),
'context': Value(dtype='string'),
'id': Value(dtype='string'),
'question': Value(dtype='string'),
'title': Value(dtype='string')}
{'id': Value(dtype='string'),
'title': Value(dtype='string'),
'context': Value(dtype='string'),
'question': Value(dtype='string'),
'answers': {'text': List(feature=Value(dtype='string'), length=-1),
'answer_start': List(feature=Value(dtype='int32'), length=-1)}}
```

The `answers` field is constructed using the [`Sequence`] feature because it contains two subfields, `text` and `answer_start`, which are lists of `string` and `int32`, respectively.
The `answers` field is constructed using a dict of features because it contains two subfields, `text` and `answer_start`, which are lists of `string` and `int32`, respectively.
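To declare such a column yourself, a minimal sketch using the new [`List`] feature (the constructor arguments are assumed from the repr shown above):

```py
>>> from datasets import Features, List, Value
>>> features = Features({
...     'answers': {
...         'text': List(Value('string')),
...         'answer_start': List(Value('int32')),
...     }
... })
```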

<Tip>

2 changes: 2 additions & 0 deletions docs/source/package_reference/main_classes.mdx
@@ -235,6 +235,8 @@ Dictionary with split names as keys ('train', 'test' for example), and `Iterable

[[autodoc]] datasets.LargeList

[[autodoc]] datasets.List

[[autodoc]] datasets.Sequence

### Translation
11 changes: 6 additions & 5 deletions docs/source/process.mdx
@@ -265,11 +265,12 @@ Sometimes a column can be a nested structure of several types. Take a look at th
>>> from datasets import load_dataset
>>> dataset = load_dataset("rajpurkar/squad", split="train")
>>> dataset.features
{'answers': Sequence(feature={'text': Value(dtype='string'), 'answer_start': Value(dtype='int32')}, length=-1),
'context': Value(dtype='string'),
'id': Value(dtype='string'),
'question': Value(dtype='string'),
'title': Value(dtype='string')}
{'id': Value(dtype='string'),
'title': Value(dtype='string'),
'context': Value(dtype='string'),
'question': Value(dtype='string'),
'answers': {'text': List(feature=Value(dtype='string'), length=-1),
'answer_start': List(feature=Value(dtype='int32'), length=-1)}}
```

The `answers` field contains two subfields: `text` and `answer_start`. Use the [`~Dataset.flatten`] function to extract the subfields into their own separate columns:
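For example, a short sketch of the result (the flattened column list matches the [`~Dataset.flatten`] docstring elsewhere in this diff):

```py
>>> flat_ds = dataset.flatten()
>>> flat_ds.column_names
['id', 'title', 'context', 'question', 'answers.text', 'answers.answer_start']
```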
55 changes: 34 additions & 21 deletions src/datasets/arrow_dataset.py
@@ -76,11 +76,12 @@
from .arrow_writer import ArrowWriter, OptimizedTypedSequence
from .data_files import sanitize_patterns
from .download.streaming_download_manager import xgetsize
from .features import Audio, ClassLabel, Features, Image, Sequence, Value, Video
from .features import Audio, ClassLabel, Features, Image, List, Value, Video
from .features.features import (
FeatureType,
_align_features,
_check_if_features_can_be_aligned,
_fix_for_backward_compatible_features,
generate_from_arrow_type,
pandas_types_mapper,
require_decoding,
@@ -897,6 +898,8 @@ def from_pandas(
f"Features specified in `features` and `info.features` can't be different:\n{features}\n{info.features}"
)
features = features if features is not None else info.features if info is not None else None
if features is not None:
features = _fix_for_backward_compatible_features(features)
if info is None:
info = DatasetInfo()
info.features = features
@@ -942,6 +945,8 @@ def from_polars(
f"Features specified in `features` and `info.features` can't be different:\n{features}\n{info.features}"
)
features = features if features is not None else info.features if info is not None else None
if features is not None:
features = _fix_for_backward_compatible_features(features)
if info is None:
info = DatasetInfo()
info.features = features
@@ -987,6 +992,8 @@ def from_dict(
f"Features specified in `features` and `info.features` can't be different:\n{features}\n{info.features}"
)
features = features if features is not None else info.features if info is not None else None
if features is not None:
features = _fix_for_backward_compatible_features(features)
arrow_typed_mapping = {}
for col, data in mapping.items():
if isinstance(data, (pa.Array, pa.ChunkedArray)):
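The three constructors above all apply the same normalize-before-use pattern. A conceptual sketch of what such a helper might do, assuming (as the rest of this diff suggests) that legacy `Sequence` declarations are rewritten to `List`; this is illustrative only, not the actual implementation:

```py
from datasets import Features, List, Sequence, Value

def fix_features_sketch(features: Features) -> Features:
    # Illustrative only: rewrite a top-level Sequence of a plain feature
    # into the equivalent List; the real helper also handles nested types.
    fixed = {}
    for name, feature in features.items():
        if isinstance(feature, Sequence) and not isinstance(feature.feature, dict):
            fixed[name] = List(feature.feature, length=feature.length)
        else:
            fixed[name] = feature
    return Features(fixed)
```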
@@ -1950,14 +1957,14 @@ def class_encode_column(self, column: str, include_nulls: bool = False) -> "Data
>>> from datasets import load_dataset
>>> ds = load_dataset("boolq", split="validation")
>>> ds.features
{'answer': Value(dtype='bool', id=None),
'passage': Value(dtype='string', id=None),
'question': Value(dtype='string', id=None)}
{'answer': Value(dtype='bool'),
'passage': Value(dtype='string'),
'question': Value(dtype='string')}
>>> ds = ds.class_encode_column('answer')
>>> ds.features
{'answer': ClassLabel(num_classes=2, names=['False', 'True'], id=None),
'passage': Value(dtype='string', id=None),
'question': Value(dtype='string', id=None)}
{'answer': ClassLabel(num_classes=2, names=['False', 'True']),
'passage': Value(dtype='string'),
'question': Value(dtype='string')}
```
"""
# Sanity checks
@@ -2028,11 +2035,12 @@ def flatten(self, new_fingerprint: Optional[str] = None, max_depth=16) -> "Datas
>>> from datasets import load_dataset
>>> ds = load_dataset("rajpurkar/squad", split="train")
>>> ds.features
{'answers': Sequence(feature={'text': Value(dtype='string', id=None), 'answer_start': Value(dtype='int32', id=None)}, length=-1, id=None),
'context': Value(dtype='string', id=None),
'id': Value(dtype='string', id=None),
'question': Value(dtype='string', id=None),
'title': Value(dtype='string', id=None)}
{'id': Value(dtype='string'),
'title': Value(dtype='string'),
'context': Value(dtype='string'),
'question': Value(dtype='string'),
'answers': {'text': List(feature=Value(dtype='string'), length=-1),
'answer_start': List(feature=Value(dtype='int32'), length=-1)}}
>>> ds.flatten()
Dataset({
features: ['id', 'title', 'context', 'question', 'answers.text', 'answers.answer_start'],
@@ -2100,15 +2108,15 @@ def cast(
>>> from datasets import load_dataset, ClassLabel, Value
>>> ds = load_dataset("cornell-movie-review-data/rotten_tomatoes", split="validation")
>>> ds.features
{'label': ClassLabel(names=['neg', 'pos'], id=None),
'text': Value(dtype='string', id=None)}
{'label': ClassLabel(names=['neg', 'pos']),
'text': Value(dtype='string')}
>>> new_features = ds.features.copy()
>>> new_features['label'] = ClassLabel(names=['bad', 'good'])
>>> new_features['text'] = Value('large_string')
>>> ds = ds.cast(new_features)
>>> ds.features
{'label': ClassLabel(names=['bad', 'good'], id=None),
'text': Value(dtype='large_string', id=None)}
{'label': ClassLabel(names=['bad', 'good']),
'text': Value(dtype='large_string')}
```
"""
if sorted(features) != sorted(self._data.column_names):
@@ -2117,6 +2125,7 @@ def cast(
f"as the columns in the dataset: {self._data.column_names}"
)

features = _fix_for_backward_compatible_features(features)
schema = features.arrow_schema
format = self.format
dataset = self.with_format("arrow")
@@ -2158,14 +2167,15 @@ def cast_column(self, column: str, feature: FeatureType, new_fingerprint: Option
>>> from datasets import load_dataset, ClassLabel
>>> ds = load_dataset("cornell-movie-review-data/rotten_tomatoes", split="validation")
>>> ds.features
{'label': ClassLabel(names=['neg', 'pos'], id=None),
'text': Value(dtype='string', id=None)}
{'label': ClassLabel(names=['neg', 'pos']),
'text': Value(dtype='string')}
>>> ds = ds.cast_column('label', ClassLabel(names=['bad', 'good']))
>>> ds.features
{'label': ClassLabel(names=['bad', 'good'], id=None),
'text': Value(dtype='string', id=None)}
{'label': ClassLabel(names=['bad', 'good']),
'text': Value(dtype='string')}
```
"""
feature = _fix_for_backward_compatible_features(feature)
if hasattr(feature, "decode_example"):
dataset = copy.deepcopy(self)
dataset._info.features[column] = feature
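Given that normalization, a legacy-style cast should keep working; a hedged sketch (the `tokens` column is hypothetical, and the printed feature assumes the helper rewrites `Sequence` to `List`):

```py
>>> from datasets import Sequence, Value
>>> ds = ds.cast_column('tokens', Sequence(Value('string')))  # legacy spelling
>>> ds.features['tokens']
List(feature=Value(dtype='string'), length=-1)
```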
@@ -3082,6 +3092,9 @@ def map(
if fn_kwargs is None:
fn_kwargs = {}

if features is not None:
features = _fix_for_backward_compatible_features(features)

if num_proc is not None and num_proc > len(self):
num_proc = len(self)
logger.warning(
@@ -6350,7 +6363,7 @@ def process_label_ids(batch):
features[label_column] = (
ClassLabel(num_classes=len(label_names), names=label_names)
if isinstance(label_feature, ClassLabel)
else Sequence(ClassLabel(num_classes=len(label_names), names=label_names))
else List(ClassLabel(num_classes=len(label_names), names=label_names))
)
return self.map(process_label_ids, features=features, batched=True, desc="Aligning the labels")
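This branch sits in the label-alignment path (note the `desc="Aligning the labels"` above); a typical call, sketched under the assumption that the surrounding method is [`~Dataset.align_labels_with_mapping`]:

```py
>>> from datasets import load_dataset
>>> ds = load_dataset('glue', 'mnli', split='train')
>>> label2id = {'entailment': 0, 'neutral': 1, 'contradiction': 2}
>>> ds = ds.align_labels_with_mapping(label2id, 'label')
```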

4 changes: 2 additions & 2 deletions src/datasets/builder.py
@@ -513,7 +513,7 @@ def get_all_exported_dataset_infos(cls) -> DatasetInfosDict:
>>> from datasets import load_dataset_builder
>>> ds_builder = load_dataset_builder('vivos')
>>> ds_builder.get_all_exported_dataset_infos()
{'default': DatasetInfo(description='', citation='', homepage='', license='', features={'speaker_id': Value(dtype='string', id=None), 'path': Value(dtype='string', id=None), 'audio': Audio(sampling_rate=16000, mono=True, decode=True, id=None), 'sentence': Value(dtype='string', id=None)}, post_processed=None, supervised_keys=None, builder_name=None, dataset_name=None, config_name='default', version=None, splits={'train': SplitInfo(name='train', num_bytes=1722002133, num_examples=11660, shard_lengths=None, dataset_name=None), 'test': SplitInfo(name='test', num_bytes=86120227, num_examples=760, shard_lengths=None, dataset_name=None)}, download_checksums=None, download_size=1475540500, post_processing_size=None, dataset_size=1808122360, size_in_bytes=None)}
{'default': DatasetInfo(description='', citation='', homepage='', license='', features={'speaker_id': Value(dtype='string'), 'path': Value(dtype='string'), 'audio': Audio(sampling_rate=16000, mono=True, decode=True, id=None), 'sentence': Value(dtype='string')}, post_processed=None, supervised_keys=None, builder_name=None, dataset_name=None, config_name='default', version=None, splits={'train': SplitInfo(name='train', num_bytes=1722002133, num_examples=11660, shard_lengths=None, dataset_name=None), 'test': SplitInfo(name='test', num_bytes=86120227, num_examples=760, shard_lengths=None, dataset_name=None)}, download_checksums=None, download_size=1475540500, post_processing_size=None, dataset_size=1808122360, size_in_bytes=None)}
```
"""
return DatasetInfosDict.from_directory(cls.get_imported_module_dir())
@@ -527,7 +527,7 @@ def get_exported_dataset_info(self) -> DatasetInfo:
>>> from datasets import load_dataset_builder
>>> ds_builder = load_dataset_builder('cornell-movie-review-data/rotten_tomatoes')
>>> ds_builder.get_exported_dataset_info()
DatasetInfo(description='', citation='', homepage='', license='', features={'speaker_id': Value(dtype='string', id=None), 'path': Value(dtype='string', id=None), 'audio': Audio(sampling_rate=16000, mono=True, decode=True, id=None), 'sentence': Value(dtype='string', id=None)}, post_processed=None, supervised_keys=None, builder_name=None, dataset_name=None, config_name='default', version=None, splits={'train': SplitInfo(name='train', num_bytes=1722002133, num_examples=11660, shard_lengths=None, dataset_name=None), 'test': SplitInfo(name='test', num_bytes=86120227, num_examples=760, shard_lengths=None, dataset_name=None)}, download_checksums=None, download_size=1475540500, post_processing_size=None, dataset_size=1808122360, size_in_bytes=None)
DatasetInfo(description='', citation='', homepage='', license='', features={'speaker_id': Value(dtype='string'), 'path': Value(dtype='string'), 'audio': Audio(sampling_rate=16000, mono=True, decode=True, id=None), 'sentence': Value(dtype='string')}, post_processed=None, supervised_keys=None, builder_name=None, dataset_name=None, config_name='default', version=None, splits={'train': SplitInfo(name='train', num_bytes=1722002133, num_examples=11660, shard_lengths=None, dataset_name=None), 'test': SplitInfo(name='test', num_bytes=86120227, num_examples=760, shard_lengths=None, dataset_name=None)}, download_checksums=None, download_size=1475540500, post_processing_size=None, dataset_size=1808122360, size_in_bytes=None)
```
"""
return self.get_all_exported_dataset_infos().get(self.config.name, DatasetInfo())
55 changes: 28 additions & 27 deletions src/datasets/dataset_dict.py
@@ -201,11 +201,12 @@ def flatten(self, max_depth=16) -> "DatasetDict":
>>> from datasets import load_dataset
>>> ds = load_dataset("rajpurkar/squad")
>>> ds["train"].features
{'answers': Sequence(feature={'text': Value(dtype='string', id=None), 'answer_start': Value(dtype='int32', id=None)}, length=-1, id=None),
'context': Value(dtype='string', id=None),
'id': Value(dtype='string', id=None),
'question': Value(dtype='string', id=None),
'title': Value(dtype='string', id=None)}
{'id': Value(dtype='string'),
'title': Value(dtype='string'),
'context': Value(dtype='string'),
'question': Value(dtype='string'),
'answers': {'text': List(feature=Value(dtype='string'), length=-1),
'answer_start': List(feature=Value(dtype='int32'), length=-1)}}
>>> ds.flatten()
DatasetDict({
train: Dataset({
@@ -288,15 +289,15 @@ def cast(self, features: Features) -> "DatasetDict":
>>> from datasets import load_dataset, ClassLabel, Value
>>> ds = load_dataset("cornell-movie-review-data/rotten_tomatoes")
>>> ds["train"].features
{'label': ClassLabel(names=['neg', 'pos'], id=None),
'text': Value(dtype='string', id=None)}
{'label': ClassLabel(names=['neg', 'pos']),
'text': Value(dtype='string')}
>>> new_features = ds["train"].features.copy()
>>> new_features['label'] = ClassLabel(names=['bad', 'good'])
>>> new_features['text'] = Value('large_string')
>>> ds = ds.cast(new_features)
>>> ds["train"].features
{'label': ClassLabel(names=['bad', 'good'], id=None),
'text': Value(dtype='large_string', id=None)}
{'label': ClassLabel(names=['bad', 'good']),
'text': Value(dtype='large_string')}
```
"""
self._check_values_type()
@@ -320,12 +321,12 @@ def cast_column(self, column: str, feature) -> "DatasetDict":
>>> from datasets import load_dataset, ClassLabel
>>> ds = load_dataset("cornell-movie-review-data/rotten_tomatoes")
>>> ds["train"].features
{'label': ClassLabel(names=['neg', 'pos'], id=None),
'text': Value(dtype='string', id=None)}
{'label': ClassLabel(names=['neg', 'pos']),
'text': Value(dtype='string')}
>>> ds = ds.cast_column('label', ClassLabel(names=['bad', 'good']))
>>> ds["train"].features
{'label': ClassLabel(names=['bad', 'good'], id=None),
'text': Value(dtype='string', id=None)}
{'label': ClassLabel(names=['bad', 'good']),
'text': Value(dtype='string')}
```
"""
self._check_values_type()
@@ -512,14 +513,14 @@ def class_encode_column(self, column: str, include_nulls: bool = False) -> "Data
>>> from datasets import load_dataset
>>> ds = load_dataset("boolq")
>>> ds["train"].features
{'answer': Value(dtype='bool', id=None),
'passage': Value(dtype='string', id=None),
'question': Value(dtype='string', id=None)}
{'answer': Value(dtype='bool'),
'passage': Value(dtype='string'),
'question': Value(dtype='string')}
>>> ds = ds.class_encode_column("answer")
>>> ds["train"].features
{'answer': ClassLabel(num_classes=2, names=['False', 'True'], id=None),
'passage': Value(dtype='string', id=None),
'question': Value(dtype='string', id=None)}
{'answer': ClassLabel(num_classes=2, names=['False', 'True']),
'passage': Value(dtype='string'),
'question': Value(dtype='string')}
```
"""
self._check_values_type()
@@ -2379,12 +2380,12 @@ def cast_column(self, column: str, feature: FeatureType) -> "IterableDatasetDict
>>> from datasets import load_dataset, ClassLabel
>>> ds = load_dataset("cornell-movie-review-data/rotten_tomatoes", streaming=True)
>>> ds["train"].features
{'label': ClassLabel(names=['neg', 'pos'], id=None),
'text': Value(dtype='string', id=None)}
{'label': ClassLabel(names=['neg', 'pos']),
'text': Value(dtype='string')}
>>> ds = ds.cast_column('label', ClassLabel(names=['bad', 'good']))
>>> ds["train"].features
{'label': ClassLabel(names=['bad', 'good'], id=None),
'text': Value(dtype='string', id=None)}
{'label': ClassLabel(names=['bad', 'good']),
'text': Value(dtype='string')}
```
"""
return IterableDatasetDict(
@@ -2415,15 +2416,15 @@ def cast(
>>> from datasets import load_dataset, ClassLabel, Value
>>> ds = load_dataset("cornell-movie-review-data/rotten_tomatoes", streaming=True)
>>> ds["train"].features
{'label': ClassLabel(names=['neg', 'pos'], id=None),
'text': Value(dtype='string', id=None)}
{'label': ClassLabel(names=['neg', 'pos']),
'text': Value(dtype='string')}
>>> new_features = ds["train"].features.copy()
>>> new_features['label'] = ClassLabel(names=['bad', 'good'])
>>> new_features['text'] = Value('large_string')
>>> ds = ds.cast(new_features)
>>> ds["train"].features
{'label': ClassLabel(names=['bad', 'good'], id=None),
'text': Value(dtype='large_string', id=None)}
{'label': ClassLabel(names=['bad', 'good']),
'text': Value(dtype='large_string')}
```
"""
return IterableDatasetDict({k: dataset.cast(features=features) for k, dataset in self.items()})
3 changes: 2 additions & 1 deletion src/datasets/features/__init__.py
@@ -7,6 +7,7 @@
"ClassLabel",
"Features",
"LargeList",
"List",
"Sequence",
"Value",
"Image",
@@ -16,7 +17,7 @@
"Pdf",
]
from .audio import Audio
from .features import Array2D, Array3D, Array4D, Array5D, ClassLabel, Features, LargeList, Sequence, Value
from .features import Array2D, Array3D, Array4D, Array5D, ClassLabel, Features, LargeList, List, Sequence, Value
from .image import Image
from .pdf import Pdf
from .translation import Translation, TranslationVariableLanguages
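With both names exported, the old and new spellings stay importable side by side; a quick compatibility sketch (the `Sequence` behavior is assumed from this diff, not verified here):

```py
from datasets import List, Sequence, Value

new_style = List(Value("string"))      # List(feature=Value(dtype='string'), length=-1)
old_style = Sequence(Value("string"))  # legacy name, kept exported for backward compatibility
```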