Speed up Tokenization by optimizing cast_to_python_objects (#523)

lhoestq · web-flow · commit 12a32b921b1f · 2020-08-24T10:54:13.000+02:00
* optimisations for cast_to_python_objects

* fix has_changed for empty list

* add tests for cast_to_python_objects

* remove map_all_sequences_to_lists
diff --git a/src/nlp/arrow_writer.py b/src/nlp/arrow_writer.py
@@ -29,7 +29,6 @@
 from .features import Features
 from .info import DatasetInfo
 from .utils.file_utils import HF_DATASETS_CACHE, hash_url_to_filename
-from .utils.py_utils import map_all_sequences_to_lists
 
 
 logger = logging.getLogger(__name__)
@@ -170,7 +169,6 @@ def write(self, example: Dict[str, Any], writer_batch_size: Optional[int] = None
         Args:
             example: the Example to add.
         """
-        example = map_all_sequences_to_lists(example)
         self.current_rows.append(example)
         self._num_examples += 1
         if writer_batch_size is None:
@@ -186,7 +184,6 @@ def write_batch(
         Args:
             example: the Example to add.
         """
-        batch_examples = map_all_sequences_to_lists(batch_examples)
         if self.pa_writer is None:
             self._build_writer(inferred_schema=pa.Table.from_pydict(batch_examples).schema)
         pa_table: pa.Table = pa.Table.from_pydict(batch_examples, schema=self._schema)
diff --git a/src/nlp/features.py b/src/nlp/features.py
@@ -38,7 +38,7 @@
     import tensorflow as tf
 
 
-def string_to_arrow(type_str: str):
+def string_to_arrow(type_str: str) -> pa.DataType:
     if type_str not in pa.__dict__:
         if str(type_str + "_") not in pa.__dict__:
             raise ValueError(
@@ -53,22 +53,75 @@ def string_to_arrow(type_str: str):
     return pa.__dict__[arrow_data_type_str]()
 
 
-def _cast_to_python_objects(obj):
-    """ Cast numpy/pytorch/tensorflow/pandas objects to python lists. """
+def _cast_to_python_objects(obj: Any) -> Tuple[Any, bool]:
+    """
+    Cast numpy/pytorch/tensorflow/pandas objects to python lists.
+    It works recursively.
+
+    To avoid iterating over possibly long lists, it first checks whether the first element that is not None needs to be cast.
+    If the first element needs to be cast, then all the elements of the list will be cast, otherwise they'll stay the same.
+    This trick makes it possible to cast objects that contain tokenizer outputs without iterating over every single token, for example.
+
+    Args:
+        obj: the object (nested struct) to cast
+
+    Returns:
+        casted_obj: the casted object
+        has_changed (bool): True if the object has been changed, False if it is identical
+    """
     if isinstance(obj, np.ndarray):
-        return obj.tolist()
+        return obj.tolist(), True
     elif _torch_available and isinstance(obj, torch.Tensor):
-        return obj.detach().cpu().numpy().tolist()
+        return obj.detach().cpu().numpy().tolist(), True
     elif _tf_available and isinstance(obj, tf.Tensor):
-        return obj.numpy().tolist()
+        return obj.numpy().tolist(), True
+    elif isinstance(obj, pd.Series):
+        return obj.values.tolist(), True
     elif isinstance(obj, pd.DataFrame):
-        return obj.values.tolist()
+        return obj.to_dict("list"), True
+    elif isinstance(obj, dict):
+        output = {}
+        has_changed = False
+        for k, v in obj.items():
+            casted_v, has_changed_v = _cast_to_python_objects(v)
+            has_changed |= has_changed_v
+            output[k] = casted_v
+        return output if has_changed else obj, has_changed
+    elif isinstance(obj, (list, tuple)):
+        if len(obj) > 0:
+            for first_elmt in obj:
+                if first_elmt is not None:
+                    break
+            casted_first_elmt, has_changed_first_elmt = _cast_to_python_objects(first_elmt)
+            if has_changed_first_elmt:
+                return [_cast_to_python_objects(elmt)[0] for elmt in obj], True
+            else:
+                if isinstance(obj, list):
+                    return obj, False
+                else:
+                    return list(obj), True
+        else:
+            return obj if isinstance(obj, list) else [], isinstance(obj, tuple)
     else:
-        return obj
+        return obj, False
+
+
+def cast_to_python_objects(obj: Any) -> Any:
+    """
+    Cast numpy/pytorch/tensorflow/pandas objects to python lists.
+    It works recursively.
+
+    To avoid iterating over possibly long lists, it first checks whether the first element that is not None needs to be cast.
+    If the first element needs to be cast, then all the elements of the list will be cast, otherwise they'll stay the same.
+    This trick makes it possible to cast objects that contain tokenizer outputs without iterating over every single token, for example.
 
+    Args:
+        obj: the object (nested struct) to cast
 
-def cast_to_python_objects(obj):
-    return utils.map_nested(_cast_to_python_objects, obj, map_list=True, map_tuple=True, map_numpy=False)
+    Returns:
+        casted_obj: the casted object
+    """
+    return _cast_to_python_objects(obj)[0]
 
 
 @dataclass
diff --git a/src/nlp/utils/py_utils.py b/src/nlp/utils/py_utils.py
@@ -41,17 +41,6 @@
 memoize = functools.lru_cache
 
 
-def map_all_sequences_to_lists(data_struct):
-    # Could add support for more exotic data_struct, like OrderedDict
-    def sequences_to_list(seq):
-        if isinstance(seq, (tuple, np.ndarray)):
-            return list(seq)
-        else:
-            return seq
-
-    return map_nested(sequences_to_list, data_struct)
-
-
 def size_str(size_in_bytes):
     """Returns a human readable size string.
 
diff --git a/tests/test_features.py b/tests/test_features.py
@@ -1,7 +1,13 @@
 from unittest import TestCase
+from unittest.mock import patch
+
+import numpy as np
+import pandas as pd
 
 from nlp.arrow_dataset import Dataset
-from nlp.features import Features, Sequence, Value
+from nlp.features import Features, Sequence, Value, _cast_to_python_objects, cast_to_python_objects
+
+from .utils import require_tf, require_torch
 
 
 class FeaturesTest(TestCase):
@@ -24,3 +30,66 @@ def test_from_arrow_schema_with_sequence(self):
         self.assertEqual(original_features.type, new_features.type)
         self.assertDictEqual(dset[0], new_dset[0])
         self.assertDictEqual(dset[:], new_dset[:])
+
+    def test_cast_to_python_objects_list(self):
+        obj = {"col_1": [{"vec": [1, 2, 3], "txt": "foo"}] * 3, "col_2": [[1, 2], [3, 4], [5, 6]]}
+        expected_obj = {"col_1": [{"vec": [1, 2, 3], "txt": "foo"}] * 3, "col_2": [[1, 2], [3, 4], [5, 6]]}
+        casted_obj = cast_to_python_objects(obj)
+        self.assertDictEqual(casted_obj, expected_obj)
+
+    def test_cast_to_python_objects_tuple(self):
+        obj = {"col_1": [{"vec": (1, 2, 3), "txt": "foo"}] * 3, "col_2": [(1, 2), (3, 4), (5, 6)]}
+        expected_obj = {"col_1": [{"vec": [1, 2, 3], "txt": "foo"}] * 3, "col_2": [[1, 2], [3, 4], [5, 6]]}
+        casted_obj = cast_to_python_objects(obj)
+        self.assertDictEqual(casted_obj, expected_obj)
+
+    def test_cast_to_python_objects_numpy(self):
+        obj = {"col_1": [{"vec": np.arange(1, 4), "txt": "foo"}] * 3, "col_2": np.arange(1, 7).reshape(3, 2)}
+        expected_obj = {"col_1": [{"vec": [1, 2, 3], "txt": "foo"}] * 3, "col_2": [[1, 2], [3, 4], [5, 6]]}
+        casted_obj = cast_to_python_objects(obj)
+        self.assertDictEqual(casted_obj, expected_obj)
+
+    def test_cast_to_python_objects_series(self):
+        obj = {
+            "col_1": pd.Series([{"vec": [1, 2, 3], "txt": "foo"}] * 3),
+            "col_2": pd.Series([[1, 2], [3, 4], [5, 6]]),
+        }
+        expected_obj = {"col_1": [{"vec": [1, 2, 3], "txt": "foo"}] * 3, "col_2": [[1, 2], [3, 4], [5, 6]]}
+        casted_obj = cast_to_python_objects(obj)
+        self.assertDictEqual(casted_obj, expected_obj)
+
+    def test_cast_to_python_objects_dataframe(self):
+        obj = pd.DataFrame({"col_1": [{"vec": [1, 2, 3], "txt": "foo"}] * 3, "col_2": [[1, 2], [3, 4], [5, 6]]})
+        expected_obj = {"col_1": [{"vec": [1, 2, 3], "txt": "foo"}] * 3, "col_2": [[1, 2], [3, 4], [5, 6]]}
+        casted_obj = cast_to_python_objects(obj)
+        self.assertDictEqual(casted_obj, expected_obj)
+
+    @require_torch
+    def test_cast_to_python_objects_torch(self):
+        import torch
+
+        obj = {
+            "col_1": [{"vec": torch.Tensor(np.arange(1, 4)), "txt": "foo"}] * 3,
+            "col_2": torch.Tensor(np.arange(1, 7).reshape(3, 2)),
+        }
+        expected_obj = {"col_1": [{"vec": [1, 2, 3], "txt": "foo"}] * 3, "col_2": [[1, 2], [3, 4], [5, 6]]}
+        casted_obj = cast_to_python_objects(obj)
+        self.assertDictEqual(casted_obj, expected_obj)
+
+    @require_tf
+    def test_cast_to_python_objects_tf(self):
+        import tensorflow as tf
+
+        obj = {
+            "col_1": [{"vec": tf.constant(np.arange(1, 4)), "txt": "foo"}] * 3,
+            "col_2": tf.constant(np.arange(1, 7).reshape(3, 2)),
+        }
+        expected_obj = {"col_1": [{"vec": [1, 2, 3], "txt": "foo"}] * 3, "col_2": [[1, 2], [3, 4], [5, 6]]}
+        casted_obj = cast_to_python_objects(obj)
+        self.assertDictEqual(casted_obj, expected_obj)
+
+    @patch("nlp.features._cast_to_python_objects", side_effect=_cast_to_python_objects)
+    def test_dont_iterate_over_each_element_in_a_list(self, mocked_cast):
+        obj = {"col_1": [[1, 2], [3, 4], [5, 6]]}
+        cast_to_python_objects(obj)
+        self.assertEqual(mocked_cast.call_count, 4)  # 4 = depth of obj