Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions .github/workflows/benchmarks.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@ jobs:
pip install --upgrade pip
pip install setuptools wheel
pip install -e .[benchmarks]
pip install -e .

# pyarrow==0.17.1
pip install pyarrow==0.17.1
Expand Down Expand Up @@ -44,4 +43,4 @@ jobs:
cat report.md >> final_report.md
echo "\n</details>" >> final_report.md

cml-send-comment final_report.md
cml-send-comment final_report.md
Binary file modified datasets/text/dummy/0.0.0/dummy_data.zip
Binary file not shown.
87 changes: 77 additions & 10 deletions datasets/text/text.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,72 @@
from dataclasses import dataclass
from typing import List

import pyarrow.csv as pac

import nlp


class Text(nlp.GeneratorBasedBuilder):
# Module-level logger for this dataset script (library logging wrapper).
logger = nlp.utils.logging.get_logger(__name__)

# Single-column schema shared by Text._info() and TextConfig.pa_convert_options:
# every example is exactly one line of text.
FEATURES = nlp.Features({"text": nlp.Value("string"),})

@dataclass
class TextConfig(nlp.BuilderConfig):
    """BuilderConfig for text files.

    All fields default to None, meaning "build/keep the pyarrow default".
    The three ``pa_*`` properties below translate the fields into the option
    objects consumed by ``pyarrow.csv.read_csv``.
    """

    # Text encoding of the input files (applied to the ReadOptions).
    encoding: str = None
    # Block size, in bytes, that pyarrow reads at once (applied to ReadOptions).
    block_size: int = None
    # Whether pyarrow may read with multiple threads (applied to ReadOptions).
    use_threads: bool = None
    # Fully-built pyarrow option objects; when provided they are used as the
    # base object instead of the defaults constructed in the properties below.
    read_options: pac.ReadOptions = None
    parse_options: pac.ParseOptions = None
    convert_options: pac.ConvertOptions = None

    @property
    def pa_read_options(self):
        """ReadOptions to use: the user-supplied object, or a default one
        declaring a single 'text' column; the scalar fields are then applied
        on top when set."""
        if self.read_options is not None:
            read_options = self.read_options
        else:
            read_options = pac.ReadOptions(
                column_names=['text'])
        # NOTE(review): this page extract has flattened indentation; the three
        # overrides below are taken to apply to BOTH branches (user-supplied or
        # default ReadOptions) — confirm against the original file.
        if self.encoding is not None:
            read_options.encoding = self.encoding
        if self.block_size is not None:
            read_options.block_size = self.block_size
        if self.use_threads is not None:
            read_options.use_threads = self.use_threads
        return read_options

    @property
    def pa_parse_options(self):
        """ParseOptions to use: the user-supplied object, or a default that
        disables all CSV interpretation."""
        if self.parse_options is not None:
            parse_options = self.parse_options
        else:
            # Presumably '\r' is chosen as a delimiter unlikely to occur inside
            # a line, and quoting/escaping are disabled, so that each whole
            # line lands in the single 'text' column — TODO confirm.
            parse_options = pac.ParseOptions(
                delimiter='\r',
                quote_char=False,
                double_quote=False,
                escape_char=False,
                newlines_in_values=False,
                ignore_empty_lines=False,
            )
        return parse_options

    @property
    def pa_convert_options(self):
        """ConvertOptions to use: the user-supplied object, or a default that
        types the column from the module-level FEATURES schema (string)."""
        if self.convert_options is not None:
            convert_options = self.convert_options
        else:
            convert_options = pac.ConvertOptions(
                column_types=FEATURES.type,
            )
        return convert_options


class Text(nlp.ArrowBasedBuilder):
BUILDER_CONFIG_CLASS = TextConfig

def _info(self):
    """Return the dataset metadata: a single string column named "text".

    Returns:
        nlp.DatasetInfo built from the module-level FEATURES schema.
    """
    # BUG FIX: the block contained two consecutive return statements (diff
    # residue); the second was unreachable and the first hard-coded a schema
    # duplicating FEATURES. Keep the single DRY return so _info() stays in
    # sync with the Arrow convert options derived from FEATURES.
    return nlp.DatasetInfo(features=FEATURES)

def _split_generators(self, dl_manager):
""" The `datafiles` kwarg in load_dataset() can be a str, List[str], Dict[str,str], or Dict[str,List[str]].
Expand All @@ -26,11 +89,15 @@ def _split_generators(self, dl_manager):
splits.append(nlp.SplitGenerator(name=split_name, gen_kwargs={"files": files}))
return splits

def _generate_examples(self, files):
""" Read files sequentially, then lines sequentially. """
idx = 0
for filename in files:
with open(filename, encoding="utf-8") as file:
for line in file:
yield idx, {"text": line}
idx += 1
def _generate_tables(self, files):
    """Read each text file with pyarrow's CSV reader, one Arrow table per file.

    Args:
        files: list of file paths.

    Yields:
        (file_index, pyarrow.Table) pairs; each table has the single
        "text" column declared by the config's read options.
    """
    for i, file in enumerate(files):
        pa_table = pac.read_csv(
            file,
            read_options=self.config.pa_read_options,
            parse_options=self.config.pa_parse_options,
            # BUG FIX: use the pa_convert_options property — consistent with
            # the two sibling options above — instead of the raw
            # self.config.convert_options attribute, which is None unless the
            # caller supplied one and would therefore drop the FEATURES-based
            # column typing.
            convert_options=self.config.pa_convert_options,
        )
        # Uncomment for debugging (will print the Arrow table size and elements)
        # logger.warning(f"pa_table: {pa_table} num rows: {pa_table.num_rows}")
        # logger.warning('\n'.join(str(pa_table.slice(i, 1).to_pydict()) for i in range(pa_table.num_rows)))
        yield i, pa_table
7 changes: 4 additions & 3 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,9 +73,10 @@
]

# Benchmark extras. Versions are pinned so benchmark results are reproducible
# across CI runs.
# BUG FIX: the list contained both unpinned and pinned specifiers for the same
# packages ('tensorflow' AND 'tensorflow==2.3.0', etc. — diff residue), giving
# pip conflicting/ambiguous requirements; keep only the pinned final state.
BENCHMARKS_REQUIRE = [
    'numpy==1.18.5',
    'tensorflow==2.3.0',
    'torch==1.6.0',
    'transformers==3.0.2',
]

TESTS_REQUIRE = [
Expand Down