
Commit 97dce4f

committed
updates and clean up on metrics class
1 parent 42ab0df commit 97dce4f

File tree

4 files changed (+149, -6 lines)

docs/source/using_metrics.rst

Lines changed: 120 additions & 1 deletion
@@ -49,4 +49,123 @@ Adding model predictions and references can be done using either one of the :fun
 
 The model predictions and references can be provided in a wide number of formats (python lists, numpy arrays, pytorch tensors, tensorflow tensors), the metric object will take care of converting them to a suitable format for temporary storage and computation (as well as bringing them back to cpu and detaching them from gradients for PyTorch tensors).
 
-The exact format of the inputs is specific to each metric script and can be read in the
+The exact format of the inputs is specific to each metric script and can be found in :obj:`nlp.Metric.features`, :obj:`nlp.Metric.inputs_description` and the string representation of the :class:`nlp.Metric` object:
+
+.. code-block::
+
+    >>> import nlp
+
+    >>> metric = nlp.load_metric('./metrics/sacrebleu')
+
+    >>> print(metric)
+    Metric(name: "sacrebleu", features: {'predictions': Value(dtype='string', id='sequence'), 'references': Sequence(feature=Value(dtype='string', id='sequence'), length=-1, id='references')}, usage: """
+    Produces BLEU scores along with its sufficient statistics
+    from a source against one or more references.
+
+    Args:
+        predictions: The system stream (a sequence of segments)
+        references: A list of one or more reference streams (each a sequence of segments)
+        smooth: The smoothing method to use
+        smooth_value: For 'floor' smoothing, the floor to use
+        force: Ignore data that looks already tokenized
+        lowercase: Lowercase the data
+        tokenize: The tokenizer to use
+    Returns:
+        'score': BLEU score,
+        'counts': Counts,
+        'totals': Totals,
+        'precisions': Precisions,
+        'bp': Brevity penalty,
+        'sys_len': predictions length,
+        'ref_len': reference length,
+    """)
+
+    >>> print(metric.features)
+    {'predictions': Value(dtype='string', id='sequence'), 'references': Sequence(feature=Value(dtype='string', id='sequence'), length=-1, id='references')}
+
+    >>> print(metric.inputs_description)
+
+    Produces BLEU scores along with its sufficient statistics
+    from a source against one or more references.
+
+    Args:
+        predictions: The system stream (a sequence of segments)
+        references: A list of one or more reference streams (each a sequence of segments)
+        smooth: The smoothing method to use
+        smooth_value: For 'floor' smoothing, the floor to use
+        force: Ignore data that looks already tokenized
+        lowercase: Lowercase the data
+        tokenize: The tokenizer to use
+    Returns:
+        'score': BLEU score,
+        'counts': Counts,
+        'totals': Totals,
+        'precisions': Precisions,
+        'bp': Brevity penalty,
+        'sys_len': predictions length,
+        'ref_len': reference length,
+
+Here we can see that the ``sacrebleu`` metric expects a sequence of segments as predictions and a list of one or several sequences of segments as references.
+
+You can find more information on the segments in the description, homepage and publication of ``sacrebleu``, which can be accessed via the respective attributes on the metric:
+
+.. code-block::
+    >>> print(metric.description)
+    SacreBLEU provides hassle-free computation of shareable, comparable, and reproducible BLEU scores.
+    Inspired by Rico Sennrich's `multi-bleu-detok.perl`, it produces the official WMT scores but works with plain text.
+    It also knows all the standard test sets and handles downloading, processing, and tokenization for you.
+
+    See the [README.md] file at https://github.com/mjpost/sacreBLEU for more information.
+
+    >>> print(metric.homepage)
+    https://github.com/mjpost/sacreBLEU
+    >>> print(metric.citation)
+    @inproceedings{post-2018-call,
+        title = "A Call for Clarity in Reporting {BLEU} Scores",
+        author = "Post, Matt",
+        booktitle = "Proceedings of the Third Conference on Machine Translation: Research Papers",
+        month = oct,
+        year = "2018",
+        address = "Belgium, Brussels",
+        publisher = "Association for Computational Linguistics",
+        url = "https://www.aclweb.org/anthology/W18-6319",
+        pages = "186--191",
+    }
+
+Let's use ``sacrebleu`` with the official quick-start example on its homepage at https://github.com/mjpost/sacreBLEU:
+
+.. code-block::
+
+    >>> reference_batch = [['The dog bit the man.', 'The dog had bit the man.'],
+    ...                    ['It was not unexpected.', 'No one was surprised.'],
+    ...                    ['The man bit him first.', 'The man had bitten the dog.']]
+    >>> sys_batch = ['The dog bit the man.', "It wasn't surprising.", 'The man had just bitten him.']
+    >>> score = metric.add_batch(predictions=sys_batch, references=reference_batch)
+    >>> print(metric)
+    Metric(name: "sacrebleu", features: {'predictions': Value(dtype='string', id='sequence'), 'references': Sequence(feature=Value(dtype='string', id='sequence'), length=-1, id='references')}, usage: """
+    Produces BLEU scores along with its sufficient statistics
+    from a source against one or more references.
+
+    Args:
+        predictions: The system stream (a sequence of segments)
+        references: A list of one or more reference streams (each a sequence of segments)
+        smooth: The smoothing method to use
+        smooth_value: For 'floor' smoothing, the floor to use
+        force: Ignore data that looks already tokenized
+        lowercase: Lowercase the data
+        tokenize: The tokenizer to use
+    Returns:
+        'score': BLEU score,
+        'counts': Counts,
+        'totals': Totals,
+        'precisions': Precisions,
+        'bp': Brevity penalty,
+        'sys_len': predictions length,
+        'ref_len': reference length,
+    """, stored examples: 3)
+
+We have stored three evaluation examples in our metric; now let's compute the score.
+
+Computing the metric scores
+-----------------------------------------
+
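As a quick orientation for this doc hunk, here is a minimal end-to-end sketch built only from the calls shown above (`nlp.load_metric`, `metric.add_batch`); the final `metric.compute()` step is only hinted at by the truncated "Computing the metric scores" heading, so its exact signature and return keys are assumptions here (the 'score' key follows the Returns listing in the metric's usage string):

    import nlp

    # Load the sacrebleu metric script from a local path, as in the doc example.
    metric = nlp.load_metric('./metrics/sacrebleu')

    # Official sacreBLEU quick-start data (https://github.com/mjpost/sacreBLEU).
    reference_batch = [['The dog bit the man.', 'The dog had bit the man.'],
                       ['It was not unexpected.', 'No one was surprised.'],
                       ['The man bit him first.', 'The man had bitten the dog.']]
    sys_batch = ['The dog bit the man.', "It wasn't surprising.", 'The man had just bitten him.']

    # Stage the three examples; repr(metric) now reports "stored examples: 3".
    metric.add_batch(predictions=sys_batch, references=reference_batch)

    # Assumed final step (covered by the next, truncated section): finalize and score.
    results = metric.compute()
    print(results['score'])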

src/nlp/arrow_writer.py

Lines changed: 4 additions & 0 deletions
@@ -169,6 +169,10 @@ def __init__(
         self.current_rows = []
         self.pa_writer: Optional[pa.RecordBatchStreamWriter] = None
 
+    def __len__(self):
+        """ Return the number of written and staged examples """
+        return self._num_examples + len(self.current_rows)
+
     def _build_writer(self, inferred_schema: pa.Schema):
         inferred_features = Features.from_arrow_schema(inferred_schema)
         if self._features is not None:
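The new `ArrowWriter.__len__` counts examples already flushed to the Arrow stream (`self._num_examples`) plus rows still staged in `self.current_rows`; it is what the updated `Metric.__repr__` (see src/nlp/metric.py below) uses for its "stored examples" field. A hypothetical illustration through the metric API, assuming the local sacrebleu script from the docs above (the `writer` attribute is internal and only poked here to show the count):

    import nlp

    metric = nlp.load_metric('./metrics/sacrebleu')
    metric.add(prediction='The dog bit the man.',
               reference=['The dog bit the man.', 'The dog had bit the man.'])
    metric.add(prediction="It wasn't surprising.",
               reference=['It was not unexpected.', 'No one was surprised.'])

    # written + staged examples; the same number appears as "stored examples" in repr(metric)
    print(len(metric.writer))  # 2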

src/nlp/load.py

Lines changed: 8 additions & 2 deletions
@@ -200,6 +200,7 @@ def get_imports(file_path: str):
 def prepare_module(
     path: str,
     download_config: Optional[DownloadConfig] = None,
+    download_mode: Optional[GenerateMode] = None,
     dataset: bool = True,
     force_local_path: Optional[str] = None,
     **download_kwargs,
@@ -335,6 +336,9 @@ def prepare_module(
     lock_path = local_path + ".lock"
     with FileLock(lock_path):
         # Create main dataset/metrics folder if needed
+        if download_mode == GenerateMode.FORCE_REDOWNLOAD and os.path.exists(main_folder_path):
+            shutil.rmtree(main_folder_path)
+
         if not os.path.exists(main_folder_path):
             logger.info(f"Creating main folder for {module_type} {file_path} at {main_folder_path}")
             os.makedirs(main_folder_path, exist_ok=True)
@@ -428,6 +432,7 @@ def load_metric(
     experiment_id: Optional[str] = None,
     keep_in_memory: bool = False,
     download_config: Optional[DownloadConfig] = None,
+    download_mode: Optional[GenerateMode] = None,
     **metric_init_kwargs,
 ) -> Metric:
     r"""Load a `nlp.Metric`.
@@ -446,12 +451,13 @@ def load_metric(
         cache_dir (Optional str): path to store the temporary predictions and references (default to `~/.nlp/`)
         keep_in_memory (bool): Weither to store the temporary results in memory (defaults to False)
         download_config (Optional ``nlp.DownloadConfig``: specific download configuration parameters.
+        download_mode (Optional `nlp.GenerateMode`): select the download/generate mode - Defaults to REUSE_DATASET_IF_EXISTS
         experiment_id (``str``): A specific experiment id. This is used if several distributed evaluations share the same file system.
             This is useful to compute metrics in distributed setups (in particular non-additive metrics like F1).
 
     Returns: `nlp.Metric`.
     """
-    module_path, hash = prepare_module(path, download_config=download_config, dataset=False)
+    module_path, hash = prepare_module(path, download_config=download_config, download_mode=download_mode, dataset=False)
     metric_cls = import_main_class(module_path, dataset=False)
     metric = metric_cls(
         config_name=config_name,
@@ -538,7 +544,7 @@ def load_dataset(
     """
     ignore_verifications = ignore_verifications or save_infos
     # Download/copy dataset processing script
-    module_path, hash = prepare_module(path, download_config=download_config, dataset=True)
+    module_path, hash = prepare_module(path, download_config=download_config, download_mode=download_mode, dataset=True)
 
     # Get dataset builder class from the processing script
     builder_cls = import_main_class(module_path, dataset=True)
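A hedged usage sketch of the new `download_mode` plumbing: per this diff, `GenerateMode.FORCE_REDOWNLOAD` makes `prepare_module` remove the cached script folder before recreating it, so the processing script is fetched again. The docstring above refers to `nlp.GenerateMode`; whether it is exposed at the package root exactly like this is an assumption:

    import nlp

    # Re-download the metric script instead of reusing the cached copy
    # (the default mode is REUSE_DATASET_IF_EXISTS).
    metric = nlp.load_metric(
        'sacrebleu',
        download_mode=nlp.GenerateMode.FORCE_REDOWNLOAD,
    )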

src/nlp/metric.py

Lines changed: 17 additions & 3 deletions
@@ -183,7 +183,9 @@ def __init__(
         self.filelocks = None
 
     def __repr__(self):
-        return f'Metric(name: "{self.name}", features: {self.features}, usage: """{self.inputs_description}""")'
+        return (f'Metric(name: "{self.name}", features: {self.features}, '
+                f'usage: """{self.inputs_description}""", '
+                f'stored examples: {0 if self.writer is None else len(self.writer)})')
 
     def _build_data_dir(self):
         """Path of this metric in cache_dir:
@@ -344,15 +346,27 @@ def add_batch(self, *, predictions=None, references=None):
         batch = self.info.features.encode_batch(batch)
         if self.writer is None:
             self._init_writer()
-        self.writer.write_batch(batch)
+        try:
+            self.writer.write_batch(batch)
+        except pa.ArrowInvalid:
+            raise ValueError(f"Predictions and/or references don't match the expected format.\n"
+                             f"Expected format: {self.features},\n"
+                             f"Input predictions: {predictions},\n"
+                             f"Input references: {references}")
 
     def add(self, *, prediction=None, reference=None):
         """Add one prediction and reference for the metric's stack."""
         example = {"predictions": prediction, "references": reference}
         example = self.info.features.encode_example(example)
         if self.writer is None:
             self._init_writer()
-        self.writer.write(example)
+        try:
+            self.writer.write(example)
+        except pa.ArrowInvalid:
+            raise ValueError(f"Prediction and/or reference don't match the expected format.\n"
+                             f"Expected format: {self.features},\n"
+                             f"Input predictions: {prediction},\n"
+                             f"Input references: {reference}")
 
     def _init_writer(self):
         if self.keep_in_memory:
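Sketch of the new error behaviour in `Metric.add` / `Metric.add_batch`: a write that pyarrow rejects with `ArrowInvalid` is now re-raised as a `ValueError` that echoes the expected feature schema and the offending inputs. The mis-shaped reference below is a hypothetical trigger, not something taken from the commit, and might also be rejected earlier during feature encoding:

    import nlp

    metric = nlp.load_metric('./metrics/sacrebleu')
    try:
        # sacrebleu expects `reference` to be a list of strings; a bare string
        # can make the Arrow write fail, which now surfaces as a ValueError.
        metric.add(prediction='The dog bit the man.', reference='The dog bit the man.')
    except ValueError as err:
        print(err)  # message includes "Expected format: ..." with the metric's features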

0 commit comments
