Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion benchmarks/results/benchmark_array_xd.json
Original file line number Diff line number Diff line change
@@ -1 +1 @@
{"write_array2d": 0.07093274600629229, "read_unformated after write_array2d": 0.03530075500020757, "read_formatted_as_numpy after write_array2d": 0.10929270699853078, "read_batch_unformated after write_array2d": 0.03727920600795187, "read_batch_formatted_as_numpy after write_array2d": 0.018853643006877974, "read_col_unformated after write_array2d": 0.05644163000397384, "read_col_formatted_as_numpy after write_array2d": 0.011610292000113986, "write_nested_sequence": 1.6535991109994939, "read_unformated after write_nested_sequence": 0.3739209540071897, "read_formatted_as_numpy after write_nested_sequence": 0.40762836500653066, "read_batch_unformated after write_nested_sequence": 0.3337586460111197, "read_batch_formatted_as_numpy after write_nested_sequence": 0.054717567007173784, "read_col_unformated after write_nested_sequence": 0.3173944180016406, "read_col_formatted_as_numpy after write_nested_sequence": 0.004956340009812266, "write_flattened_sequence": 1.4975415869994322, "read_unformated after write_flattened_sequence": 0.26713552299770527, "read_formatted_as_numpy after write_flattened_sequence": 0.07673935199272819, "read_batch_unformated after write_flattened_sequence": 0.25450974798877724, "read_batch_formatted_as_numpy after write_flattened_sequence": 0.009374254994327202, "read_col_unformated after write_flattened_sequence": 0.25912448299641255, "read_col_formatted_as_numpy after write_flattened_sequence": 0.004277604995877482}
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

are these files supposed to be part of the PR ?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

we don't care that much I guess but let me remove them indeed

{"write_array2d": 0.14168284999323077, "read_unformated after write_array2d": 0.04353281999647152, "read_formatted_as_numpy after write_array2d": 0.1285462469968479, "read_batch_unformated after write_array2d": 0.023109222995117307, "read_batch_formatted_as_numpy after write_array2d": 0.011352884990628809, "read_col_unformated after write_array2d": 0.037052362007671036, "read_col_formatted_as_numpy after write_array2d": 0.007985618998645805, "write_nested_sequence": 1.4927163410029607, "read_unformated after write_nested_sequence": 0.28319963401008863, "read_formatted_as_numpy after write_nested_sequence": 0.419271487990045, "read_batch_unformated after write_nested_sequence": 0.3234798710036557, "read_batch_formatted_as_numpy after write_nested_sequence": 0.03850809299910907, "read_col_unformated after write_nested_sequence": 0.29384092400141526, "read_col_formatted_as_numpy after write_nested_sequence": 0.004250421989127062, "write_flattened_sequence": 1.4521546780015342, "read_unformated after write_flattened_sequence": 0.25513897799828555, "read_formatted_as_numpy after write_flattened_sequence": 0.07564631900459062, "read_batch_unformated after write_flattened_sequence": 0.2758980469952803, "read_batch_formatted_as_numpy after write_flattened_sequence": 0.011008214991306886, "read_col_unformated after write_flattened_sequence": 0.25848906899045687, "read_col_formatted_as_numpy after write_flattened_sequence": 0.004328447001171298}
2 changes: 1 addition & 1 deletion benchmarks/results/benchmark_indices_mapping.json
Original file line number Diff line number Diff line change
@@ -1 +1 @@
{"num examples": 500000, "select": 0.034579699000460096, "sort": 0.5558962740033166, "shuffle": 0.18372017299407162, "train_test_split": 0.29882429300050717, "shard": 0.014594822001527064}
{"num examples": 500000, "select": 0.03741131999413483, "sort": 0.7371353159978753, "shuffle": 0.17655655200360343, "train_test_split": 0.29633847798686475, "shard": 0.01452581599005498}
2 changes: 1 addition & 1 deletion benchmarks/results/benchmark_iterating.json
Original file line number Diff line number Diff line change
@@ -1 +1 @@
{"num examples": 50000, "read 5000": 0.2146801049966598, "read 50000": 2.115211837008246, "read_batch 50000 10": 1.4512460459955037, "read_batch 50000 100": 1.385236697999062, "read_batch 50000 1000": 1.4181318079936318, "read_formatted numpy 5000": 4.044872473998112, "read_formatted pandas 5000": 3.4310112629900686, "read_formatted torch 5000": 4.470335923993844, "read_formatted tensorflow 5000": 5.384795637000934, "read_formatted_batch numpy 5000 10": 0.4460094000096433, "read_formatted_batch numpy 5000 1000": 0.007665968994842842, "shuffled read 5000": 0.2283045439980924, "shuffled read 50000": 2.2466989499953343, "shuffled read_batch 50000 10": 59.94365781600936, "shuffled read_batch 50000 100": 7.204961794006522, "shuffled read_batch 50000 1000": 2.4927480350015685, "shuffled read_formatted numpy 5000": 4.631365966997691, "shuffled read_formatted_batch numpy 5000 10": 6.5569094810052775, "shuffled read_formatted_batch numpy 5000 1000": 0.06912206900597084}
{"num examples": 50000, "read 5000": 0.2152090710005723, "read 50000": 2.077654693988734, "read_batch 50000 10": 1.5041199039987987, "read_batch 50000 100": 1.5411947140091797, "read_batch 50000 1000": 1.4684901159926085, "read_formatted numpy 5000": 4.584776938994764, "read_formatted pandas 5000": 3.7457121399929747, "read_formatted torch 5000": 4.565676491998602, "read_formatted tensorflow 5000": 5.269861594992108, "read_formatted_batch numpy 5000 10": 0.4242750950070331, "read_formatted_batch numpy 5000 1000": 0.007607111998368055, "shuffled read 5000": 0.22604441999283154, "shuffled read 50000": 2.268928524994408, "shuffled read_batch 50000 10": 55.44462437101174, "shuffled read_batch 50000 100": 6.876476717996411, "shuffled read_batch 50000 1000": 2.1420724369963864, "shuffled read_formatted numpy 5000": 4.8052272600034485, "shuffled read_formatted_batch numpy 5000 10": 6.500664097999106, "shuffled read_formatted_batch numpy 5000 1000": 0.0754691059992183}
2 changes: 1 addition & 1 deletion benchmarks/results/benchmark_map_filter.json
Original file line number Diff line number Diff line change
@@ -1 +1 @@
{"num examples": 500000, "map identity": 10.665630593008245, "map identity batched": 0.7198751819960307, "map no-op batched": 0.5252309559873538, "map no-op batched numpy": 0.5331113779975567, "map no-op batched pandas": 0.3913883039931534, "map no-op batched pytorch": 0.5091918510006508, "map no-op batched tensorflow": 1.2273747170111164, "map fast-tokenizer batched": 8.285753931006184, "filter": 1.7507986380078364}
{"num examples": 500000, "map identity": 10.19139202599763, "map identity batched": 0.6804238399927272, "map no-op batched": 0.5342009569867514, "map no-op batched numpy": 0.5792830920108827, "map no-op batched pandas": 0.4343639040016569, "map no-op batched pytorch": 0.5403374370071106, "map no-op batched tensorflow": 1.3869360350072384, "map fast-tokenizer batched": 8.074308118986664, "filter": 1.841787679004483}
1 change: 0 additions & 1 deletion datasets/wiki_dpr/wiki_dpr.py
Original file line number Diff line number Diff line change
Expand Up @@ -181,7 +181,6 @@ def _post_process(self, dataset, resources_paths):
dataset.add_faiss_index(
"embeddings",
train_size=train_size,
faiss_verbose=logging.getLogger().level <= logging.DEBUG,
custom_index=ivf_index,
)
logging.info("Saving wiki_dpr faiss index")
Expand Down
2 changes: 1 addition & 1 deletion docs/source/add_dataset.rst
Original file line number Diff line number Diff line change
Expand Up @@ -216,7 +216,7 @@ Here again, let's take the simple example of the `squad dataset loading script <

def _generate_examples(self, filepath):
"""This function returns the examples in the raw (text) form."""
logging.info("generating examples from = %s", filepath)
logger.info("generating examples from = %s", filepath)
with open(filepath) as f:
squad = json.load(f)
for article in squad["data"]:
Expand Down
1 change: 1 addition & 0 deletions docs/source/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -73,3 +73,4 @@ The documentation is organized in five parts:
package_reference/loading_methods
package_reference/main_classes
package_reference/builder_classes
package_reference/logging_methods
50 changes: 50 additions & 0 deletions docs/source/package_reference/logging_methods.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
Logging methods
----------------------------------------------------

`nlp` tries to be very transparent and explicit about its inner workings, but this can be quite verbose at times.
A series of logging methods let you easily adjust the level of logging of the whole library.

Functions
~~~~~~~~~~~~~~~~~~~~~

.. autofunction:: nlp.logging.get_verbosity

.. autofunction:: nlp.logging.set_verbosity

.. autofunction:: nlp.logging.set_verbosity_info

.. autofunction:: nlp.logging.set_verbosity_warning

.. autofunction:: nlp.logging.set_verbosity_debug

.. autofunction:: nlp.logging.set_verbosity_error

.. autofunction:: nlp.logging.disable_default_handler

.. autofunction:: nlp.logging.enable_default_handler

.. autofunction:: nlp.logging.disable_propagation

.. autofunction:: nlp.logging.enable_propagation

.. autofunction:: nlp.logging.get_logger

Levels
~~~~~~~~~~~~~~~~~~~~~

.. autodata:: nlp.logging.CRITICAL

.. autodata:: nlp.logging.DEBUG

.. autodata:: nlp.logging.ERROR

.. autodata:: nlp.logging.FATAL

.. autodata:: nlp.logging.INFO

.. autodata:: nlp.logging.NOTSET

.. autodata:: nlp.logging.WARN

.. autodata:: nlp.logging.WARNING

4 changes: 2 additions & 2 deletions metrics/bleurt/bleurt.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,14 +14,14 @@
# limitations under the License.
""" BLEURT metric. """

import logging
import os

import nlp
from nlp.logging import get_logger
from bleurt import score # From: git+https://github.com/google-research/bleurt.git


logger = logging.getLogger(__name__)
logger = get_logger(__name__)

_CITATION = """\
@inproceedings{bleurt,
Expand Down
3 changes: 0 additions & 3 deletions nlp-cli
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
#!/usr/bin/env python
import logging
from argparse import ArgumentParser

from nlp.commands.convert import ConvertCommand
Expand All @@ -10,8 +9,6 @@ from nlp.commands.test import TestCommand
from nlp.commands.run_beam import RunBeamCommand
from nlp.commands.dummy_data import DummyDataCommand

logging.basicConfig(level=logging.INFO)

if __name__ == '__main__':
parser = ArgumentParser('HuggingFace NLP CLI tool', usage='nlp-cli <command> [<args>]')
commands_parser = parser.add_subparsers(help='nlp-cli command helpers')
Expand Down
28 changes: 1 addition & 27 deletions notebooks/Overview.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -2330,32 +2330,6 @@
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "hJHyEmievSUh",
"colab_type": "code",
"outputId": "afc32e2a-6d42-4d77-fee6-0afdb5a1f206",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 34
}
},
"source": [
"import logging\n",
"logging.basicConfig(level=logging.INFO)"
],
"execution_count": 0,
"outputs": [
{
"output_type": "stream",
"text": [
"17\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
Expand Down Expand Up @@ -4991,4 +4965,4 @@
"outputs": []
}
]
}
}
Loading