Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions .github/workflows/benchmarks.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@ jobs:
pip install --upgrade pip
pip install setuptools wheel
pip install -e .[benchmarks]
pip install -e .

# pyarrow==0.17.1
pip install pyarrow==0.17.1
Expand Down Expand Up @@ -44,4 +43,4 @@ jobs:
cat report.md >> final_report.md
echo "\n</details>" >> final_report.md

cml-send-comment final_report.md
cml-send-comment final_report.md
Binary file modified datasets/text/dummy/0.0.0/dummy_data.zip
Binary file not shown.
87 changes: 77 additions & 10 deletions datasets/text/text.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,72 @@
from dataclasses import dataclass
from typing import List

import pyarrow.csv as pac

import nlp


class Text(nlp.GeneratorBasedBuilder):
# Module-level logger for this dataset script (library logging wrapper).
logger = nlp.utils.logging.get_logger(__name__)

# Single-column schema shared by Text._info() and TextConfig.pa_convert_options:
# every example is exactly one line of text.
FEATURES = nlp.Features({"text": nlp.Value("string"),})

@dataclass
class TextConfig(nlp.BuilderConfig):
    """BuilderConfig for text files.

    All fields default to None, meaning "build/keep the pyarrow default".
    The three ``pa_*`` properties below translate the fields into the option
    objects consumed by ``pyarrow.csv.read_csv``.
    """

    # Text encoding of the input files (applied to the ReadOptions).
    encoding: str = None
    # Block size, in bytes, that pyarrow reads at once (applied to ReadOptions).
    block_size: int = None
    # Whether pyarrow may read with multiple threads (applied to ReadOptions).
    use_threads: bool = None
    # Fully-built pyarrow option objects; when provided they are used as the
    # base object instead of the defaults constructed in the properties below.
    read_options: pac.ReadOptions = None
    parse_options: pac.ParseOptions = None
    convert_options: pac.ConvertOptions = None

    @property
    def pa_read_options(self):
        """ReadOptions to use: the user-supplied object, or a default one
        declaring a single 'text' column; the scalar fields are then applied
        on top when set."""
        if self.read_options is not None:
            read_options = self.read_options
        else:
            read_options = pac.ReadOptions(
                column_names=['text'])
        # NOTE(review): this page extract has flattened indentation; the three
        # overrides below are taken to apply to BOTH branches (user-supplied or
        # default ReadOptions) — confirm against the original file.
        if self.encoding is not None:
            read_options.encoding = self.encoding
        if self.block_size is not None:
            read_options.block_size = self.block_size
        if self.use_threads is not None:
            read_options.use_threads = self.use_threads
        return read_options

    @property
    def pa_parse_options(self):
        """ParseOptions to use: the user-supplied object, or a default that
        disables all CSV interpretation."""
        if self.parse_options is not None:
            parse_options = self.parse_options
        else:
            # Presumably '\r' is chosen as a delimiter unlikely to occur inside
            # a line, and quoting/escaping are disabled, so that each whole
            # line lands in the single 'text' column — TODO confirm.
            parse_options = pac.ParseOptions(
                delimiter='\r',
                quote_char=False,
                double_quote=False,
                escape_char=False,
                newlines_in_values=False,
                ignore_empty_lines=False,
            )
        return parse_options

    @property
    def pa_convert_options(self):
        """ConvertOptions to use: the user-supplied object, or a default that
        types the column from the module-level FEATURES schema (string)."""
        if self.convert_options is not None:
            convert_options = self.convert_options
        else:
            convert_options = pac.ConvertOptions(
                column_types=FEATURES.type,
            )
        return convert_options


class Text(nlp.ArrowBasedBuilder):
BUILDER_CONFIG_CLASS = TextConfig

def _info(self):
    """Return the dataset metadata: a single string column named "text".

    Returns:
        nlp.DatasetInfo built from the module-level FEATURES schema.
    """
    # BUG FIX: the block contained two consecutive return statements (diff
    # residue); the second was unreachable and the first hard-coded a schema
    # duplicating FEATURES. Keep the single DRY return so _info() stays in
    # sync with the Arrow convert options derived from FEATURES.
    return nlp.DatasetInfo(features=FEATURES)

def _split_generators(self, dl_manager):
""" The `datafiles` kwarg in load_dataset() can be a str, List[str], Dict[str,str], or Dict[str,List[str]].
Expand All @@ -26,11 +89,15 @@ def _split_generators(self, dl_manager):
splits.append(nlp.SplitGenerator(name=split_name, gen_kwargs={"files": files}))
return splits

def _generate_examples(self, files):
""" Read files sequentially, then lines sequentially. """
idx = 0
for filename in files:
with open(filename, encoding="utf-8") as file:
for line in file:
yield idx, {"text": line}
idx += 1
def _generate_tables(self, files):
    """Read each text file with pyarrow's CSV reader, one Arrow table per file.

    Args:
        files: list of file paths.

    Yields:
        (file_index, pyarrow.Table) pairs; each table has the single
        "text" column declared by the config's read options.
    """
    for i, file in enumerate(files):
        pa_table = pac.read_csv(
            file,
            read_options=self.config.pa_read_options,
            parse_options=self.config.pa_parse_options,
            # BUG FIX: use the pa_convert_options property — consistent with
            # the two sibling options above — instead of the raw
            # self.config.convert_options attribute, which is None unless the
            # caller supplied one and would therefore drop the FEATURES-based
            # column typing.
            convert_options=self.config.pa_convert_options,
        )
        # Uncomment for debugging (will print the Arrow table size and elements)
        # logger.warning(f"pa_table: {pa_table} num rows: {pa_table.num_rows}")
        # logger.warning('\n'.join(str(pa_table.slice(i, 1).to_pydict()) for i in range(pa_table.num_rows)))
        yield i, pa_table
7 changes: 4 additions & 3 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,9 +73,10 @@
]

# Benchmark extras. Versions are pinned so benchmark results are reproducible
# across CI runs.
# BUG FIX: the list contained both unpinned and pinned specifiers for the same
# packages ('tensorflow' AND 'tensorflow==2.3.0', etc. — diff residue), giving
# pip conflicting/ambiguous requirements; keep only the pinned final state.
BENCHMARKS_REQUIRE = [
    'numpy==1.18.5',
    'tensorflow==2.3.0',
    'torch==1.6.0',
    'transformers==3.0.2',
]

TESTS_REQUIRE = [
Expand Down