huggingface · mariamabarham · Aug 6, 2020 · Aug 5, 2020 · Aug 5, 2020
diff --git a/datasets/ms_marco/dataset_infos.json b/datasets/ms_marco/dataset_infos.json
@@ -0,0 +1 @@
+{"v1.1": {"description": "\nStarting with a paper released at NIPS 2016, MS MARCO is a collection of datasets focused on deep learning in search.\n\nThe first dataset was a question answering dataset featuring 100,000 real Bing questions and a human generated answer. \nSince then we released a 1,000,000 question dataset, a natural langauge generation dataset, a passage ranking dataset, \nkeyphrase extraction dataset, crawling dataset, and a conversational search.\n\nThere have been 277 submissions. 20 KeyPhrase Extraction submissions, 87 passage ranking submissions, 0 document ranking \nsubmissions, 73 QnA V2 submissions, 82 NLGEN submisions, and 15 QnA V1 submissions\n\nThis data comes in three tasks/forms: Original QnA dataset(v1.1), Question Answering(v2.1), Natural Language Generation(v2.1). \n\nThe original question answering datset featured 100,000 examples and was released in 2016. Leaderboard is now closed but data is availible below.\n\nThe current competitive tasks are Question Answering and Natural Language Generation. Question Answering features over 1,000,000 queries and \nis much like the original QnA dataset but bigger and with higher quality. The Natural Language Generation dataset features 180,000 examples and \nbuilds upon the QnA dataset to deliver answers that could be spoken by a smart speaker.\n\n\nversion v1.1", "citation": "\n@article{DBLP:journals/corr/NguyenRSGTMD16,\n  author    = {Tri Nguyen and\n               Mir Rosenberg and\n               Xia Song and\n               Jianfeng Gao and\n               Saurabh Tiwary and\n               Rangan Majumder and\n               Li Deng},\n  title     = {{MS} {MARCO:} {A} Human Generated MAchine Reading COmprehension Dataset},\n  journal   = {CoRR},\n  volume    = {abs/1611.09268},\n  year      = {2016},\n  url       = {http://arxiv.org/abs/1611.09268},\n  archivePrefix = {arXiv},\n  eprint    = {1611.09268},\n  timestamp = {Mon, 13 Aug 2018 16:49:03 +0200},\n  biburl    = {https://dblp.org/rec/journals/corr/NguyenRSGTMD16.bib},\n  bibsource = {dblp computer science bibliography, https://dblp.org}\n}\n}\n", "homepage": "https://microsoft.github.io/msmarco/", "license": "", "features": {"answers": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "passages": {"feature": {"is_selected": {"dtype": "int32", "id": null, "_type": "Value"}, "passage_text": {"dtype": "string", "id": null, "_type": "Value"}, "url": {"dtype": "string", "id": null, "_type": "Value"}}, "length": -1, "id": null, "_type": "Sequence"}, "query": {"dtype": "string", "id": null, "_type": "Value"}, "query_id": {"dtype": "int32", "id": null, "_type": "Value"}, "query_type": {"dtype": "string", "id": null, "_type": "Value"}, "wellFormedAnswers": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}}, "supervised_keys": null, "builder_name": "ms_marco", "config_name": "v1.1", "version": {"version_str": "1.1.0", "description": "New split API (https://tensorflow.org/datasets/splits)", "nlp_version_to_prepare": null, "major": 1, "minor": 1, "patch": 0}, "splits": {"validation": {"name": "validation", "num_bytes": 42710107, "num_examples": 10047, "dataset_name": "ms_marco"}, "train": {"name": "train", "num_bytes": 350884446, "num_examples": 82326, "dataset_name": "ms_marco"}, "test": {"name": "test", "num_bytes": 41020711, "num_examples": 9650, "dataset_name": "ms_marco"}}, "download_checksums": {"https://msmarco.blob.core.windows.net/msmsarcov1/train_v1.1.json.gz": {"num_bytes": 110704491, "checksum": "2aaa60df3a758137f0bb7c01fe334858477eb46fa8665ea01588e553cda6aa9f"}, "https://msmarco.blob.core.windows.net/msmsarcov1/dev_v1.1.json.gz": {"num_bytes": 13493661, "checksum": "c70fcb1de78e635cf501264891a1a56d52e7f63e69623da7dd41d89a785d67ca"}, "https://msmarco.blob.core.windows.net/msmsarcov1/test_hidden_v1.1.json": {"num_bytes": 44499856, "checksum": "083aa4f4d86ba0cedb830ca9972eff69f73cbc32b1da26b8617205f0dedea757"}}, "download_size": 168698008, "dataset_size": 434615264, "size_in_bytes": 603313272}, "v2.1": {"description": "\nStarting with a paper released at NIPS 2016, MS MARCO is a collection of datasets focused on deep learning in search.\n\nThe first dataset was a question answering dataset featuring 100,000 real Bing questions and a human generated answer. \nSince then we released a 1,000,000 question dataset, a natural langauge generation dataset, a passage ranking dataset, \nkeyphrase extraction dataset, crawling dataset, and a conversational search.\n\nThere have been 277 submissions. 20 KeyPhrase Extraction submissions, 87 passage ranking submissions, 0 document ranking \nsubmissions, 73 QnA V2 submissions, 82 NLGEN submisions, and 15 QnA V1 submissions\n\nThis data comes in three tasks/forms: Original QnA dataset(v1.1), Question Answering(v2.1), Natural Language Generation(v2.1). \n\nThe original question answering datset featured 100,000 examples and was released in 2016. Leaderboard is now closed but data is availible below.\n\nThe current competitive tasks are Question Answering and Natural Language Generation. Question Answering features over 1,000,000 queries and \nis much like the original QnA dataset but bigger and with higher quality. The Natural Language Generation dataset features 180,000 examples and \nbuilds upon the QnA dataset to deliver answers that could be spoken by a smart speaker.\n\n\nversion v2.1", "citation": "\n@article{DBLP:journals/corr/NguyenRSGTMD16,\n  author    = {Tri Nguyen and\n               Mir Rosenberg and\n               Xia Song and\n               Jianfeng Gao and\n               Saurabh Tiwary and\n               Rangan Majumder and\n               Li Deng},\n  title     = {{MS} {MARCO:} {A} Human Generated MAchine Reading COmprehension Dataset},\n  journal   = {CoRR},\n  volume    = {abs/1611.09268},\n  year      = {2016},\n  url       = {http://arxiv.org/abs/1611.09268},\n  archivePrefix = {arXiv},\n  eprint    = {1611.09268},\n  timestamp = {Mon, 13 Aug 2018 16:49:03 +0200},\n  biburl    = {https://dblp.org/rec/journals/corr/NguyenRSGTMD16.bib},\n  bibsource = {dblp computer science bibliography, https://dblp.org}\n}\n}\n", "homepage": "https://microsoft.github.io/msmarco/", "license": "", "features": {"answers": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "passages": {"feature": {"is_selected": {"dtype": "int32", "id": null, "_type": "Value"}, "passage_text": {"dtype": "string", "id": null, "_type": "Value"}, "url": {"dtype": "string", "id": null, "_type": "Value"}}, "length": -1, "id": null, "_type": "Sequence"}, "query": {"dtype": "string", "id": null, "_type": "Value"}, "query_id": {"dtype": "int32", "id": null, "_type": "Value"}, "query_type": {"dtype": "string", "id": null, "_type": "Value"}, "wellFormedAnswers": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}}, "supervised_keys": null, "builder_name": "ms_marco", "config_name": "v2.1", "version": {"version_str": "2.1.0", "description": "New split API (https://tensorflow.org/datasets/splits)", "nlp_version_to_prepare": null, "major": 2, "minor": 1, "patch": 0}, "splits": {"validation": {"name": "validation", "num_bytes": 414286005, "num_examples": 101093, "dataset_name": "ms_marco"}, "train": {"name": "train", "num_bytes": 3466972085, "num_examples": 808731, "dataset_name": "ms_marco"}, "test": {"name": "test", "num_bytes": 406197152, "num_examples": 101092, "dataset_name": "ms_marco"}}, "download_checksums": {"https://msmarco.blob.core.windows.net/msmarco/train_v2.1.json.gz": {"num_bytes": 1112116929, "checksum": "e91745411ca81e441a3bb75deb71ce000dc2fc31334085b7d499982f14218fe2"}, "https://msmarco.blob.core.windows.net/msmarco/dev_v2.1.json.gz": {"num_bytes": 138303699, "checksum": "5b3c9c20d1808ee199a930941b0d96f79e397e9234f77a1496890b138df7cb3c"}, "https://msmarco.blob.core.windows.net/msmarco/eval_v2.1_public.json.gz": {"num_bytes": 133851237, "checksum": "05ac0e448450d507e7ff8e37f48a41cc2d015f5bd2c7974d2445f00a53625db6"}}, "download_size": 1384271865, "dataset_size": 4287455242, "size_in_bytes": 5671727107}}
diff --git a/datasets/ms_marco/dummy/v1.1/1.1.0/dummy_data.zip b/datasets/ms_marco/dummy/v1.1/1.1.0/dummy_data.zip
diff --git a/datasets/ms_marco/ms_marco.py b/datasets/ms_marco/ms_marco.py
@@ -0,0 +1,196 @@
+# coding=utf-8
+# Copyright 2020 The TensorFlow Datasets Authors and the HuggingFace NLP Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Lint as: python3
+"""MS MARCO dataset."""
+
+from __future__ import absolute_import, division, print_function
+
+import json
+
+import nlp
+
+
+_CITATION = """
+@article{DBLP:journals/corr/NguyenRSGTMD16,
+  author    = {Tri Nguyen and
+               Mir Rosenberg and
+               Xia Song and
+               Jianfeng Gao and
+               Saurabh Tiwary and
+               Rangan Majumder and
+               Li Deng},
+  title     = {{MS} {MARCO:} {A} Human Generated MAchine Reading COmprehension Dataset},
+  journal   = {CoRR},
+  volume    = {abs/1611.09268},
+  year      = {2016},
+  url       = {http://arxiv.org/abs/1611.09268},
+  archivePrefix = {arXiv},
+  eprint    = {1611.09268},
+  timestamp = {Mon, 13 Aug 2018 16:49:03 +0200},
+  biburl    = {https://dblp.org/rec/journals/corr/NguyenRSGTMD16.bib},
+  bibsource = {dblp computer science bibliography, https://dblp.org}
+}
+}
+"""
+
+_DESCRIPTION = """
+Starting with a paper released at NIPS 2016, MS MARCO is a collection of datasets focused on deep learning in search.
+
+The first dataset was a question answering dataset featuring 100,000 real Bing questions and a human generated answer.
+Since then we released a 1,000,000 question dataset, a natural langauge generation dataset, a passage ranking dataset,
+keyphrase extraction dataset, crawling dataset, and a conversational search.
+
+There have been 277 submissions. 20 KeyPhrase Extraction submissions, 87 passage ranking submissions, 0 document ranking
+submissions, 73 QnA V2 submissions, 82 NLGEN submisions, and 15 QnA V1 submissions
+
+This data comes in three tasks/forms: Original QnA dataset(v1.1), Question Answering(v2.1), Natural Language Generation(v2.1).
+
+The original question answering datset featured 100,000 examples and was released in 2016. Leaderboard is now closed but data is availible below.
+
+The current competitive tasks are Question Answering and Natural Language Generation. Question Answering features over 1,000,000 queries and
+is much like the original QnA dataset but bigger and with higher quality. The Natural Language Generation dataset features 180,000 examples and
+builds upon the QnA dataset to deliver answers that could be spoken by a smart speaker.
+
+"""
+_V2_URLS = {
+    "train": "https://msmarco.blob.core.windows.net/msmarco/train_v2.1.json.gz",
+    "dev": "https://msmarco.blob.core.windows.net/msmarco/dev_v2.1.json.gz",
+    "test": "https://msmarco.blob.core.windows.net/msmarco/eval_v2.1_public.json.gz",
+}
+
+_V1_URLS = {
+    "train": "https://msmarco.blob.core.windows.net/msmsarcov1/train_v1.1.json.gz",
+    "dev": "https://msmarco.blob.core.windows.net/msmsarcov1/dev_v1.1.json.gz",
+    "test": "https://msmarco.blob.core.windows.net/msmsarcov1/test_hidden_v1.1.json",
+}
+
+
+class MsMarcoConfig(nlp.BuilderConfig):
+    """BuilderConfig for MS MARCO."""
+
+    def __init__(self, **kwargs):
+        """BuilderConfig for MS MARCO
+
+        Args:
+          **kwargs: keyword arguments forwarded to super.
+        """
+        super(MsMarcoConfig, self).__init__(**kwargs)
+
+
+class MsMarco(nlp.GeneratorBasedBuilder):
+
+    BUILDER_CONFIGS = [
+        MsMarcoConfig(
+            name="v1.1",
+            description="""version v1.1""",
+            version=nlp.Version("1.1.0", "New split API (https://tensorflow.org/datasets/splits)"),
+        ),
+        MsMarcoConfig(
+            name="v2.1",
+            description="""version v2.1""",
+            version=nlp.Version("2.1.0", "New split API (https://tensorflow.org/datasets/splits)"),
+        ),
+    ]
+
+    def _info(self):
+        return nlp.DatasetInfo(
+            description=_DESCRIPTION + "\n" + self.config.description,
+            features=nlp.Features(
+                {
+                    "answers": nlp.features.Sequence(nlp.Value("string")),
+                    "passages": nlp.features.Sequence(
+                        {
+                            "is_selected": nlp.Value("int32"),
+                            "passage_text": nlp.Value("string"),
+                            "url": nlp.Value("string"),
+                        }
+                    ),
+                    "query": nlp.Value("string"),
+                    "query_id": nlp.Value("int32"),
+                    "query_type": nlp.Value("string"),
+                    "wellFormedAnswers": nlp.features.Sequence(nlp.Value("string")),
+                }
+            ),
+            homepage="https://microsoft.github.io/msmarco/",
+            citation=_CITATION,
+        )
+
+    def _split_generators(self, dl_manager):
+        """Returns SplitGenerators."""
+        if self.config.name == "v2.1":
+            dl_path = dl_manager.download_and_extract(_V2_URLS)
+        else:
+            dl_path = dl_manager.download_and_extract(_V1_URLS)
+        return [
+            nlp.SplitGenerator(name=nlp.Split.VALIDATION, gen_kwargs={"filepath": dl_path["dev"]},),
+            nlp.SplitGenerator(name=nlp.Split.TRAIN, gen_kwargs={"filepath": dl_path["train"]},),
+            nlp.SplitGenerator(name=nlp.Split.TEST, gen_kwargs={"filepath": dl_path["test"]},),
+        ]
+
+    def _generate_examples(self, filepath):
+        """Yields examples."""
+        with open(filepath) as f:
+            if self.config.name == "v2.1":
+                data = json.load(f)
+                questions = data["query"]
+                answers = data.get("answers", {})
+                passages = data["passages"]
+                query_ids = data["query_id"]
+                query_types = data["query_type"]
+                wellFormedAnswers = data.get("wellFormedAnswers", {})
+                for key in questions:
+
+                    is_selected = [passage.get("is_selected", -1) for passage in passages[key]]
+                    passage_text = [passage["passage_text"] for passage in passages[key]]
+                    urls = [passage["url"] for passage in passages[key]]
+                    question = questions[key]
+                    answer = answers.get(key, [])
+                    query_id = query_ids[key]
+                    query_type = query_types[key]
+                    wellFormedAnswer = wellFormedAnswers.get(key, [])
+                    if wellFormedAnswer == "[]":
+                        wellFormedAnswer = []
+                    yield query_id, {
+                        "answers": answer,
+                        "passages": {"is_selected": is_selected, "passage_text": passage_text, "url": urls},
+                        "query": question,
+                        "query_id": query_id,
+                        "query_type": query_type,
+                        "wellFormedAnswers": wellFormedAnswer,
+                    }
+            if self.config.name == "v1.1":
+                for row in f:
+                    data = json.loads(row)
+                    question = data["query"]
+                    answer = data.get("answers", [])
+                    passages = data["passages"]
+                    query_id = data["query_id"]
+                    query_type = data["query_type"]
+                    wellFormedAnswer = data.get("wellFormedAnswers", [])
+
+                    is_selected = [passage.get("is_selected", -1) for passage in passages]
+                    passage_text = [passage["passage_text"] for passage in passages]
+                    urls = [passage["url"] for passage in passages]
+                    if wellFormedAnswer == "[]":
+                        wellFormedAnswer = []
+                    yield query_id, {
+                        "answers": answer,
+                        "passages": {"is_selected": is_selected, "passage_text": passage_text, "url": urls},
+                        "query": question,
+                        "query_id": query_id,
+                        "query_type": query_type,
+                        "wellFormedAnswers": wellFormedAnswer,
+                    }
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		{"v1.1": {"description": "\nStarting with a paper released at NIPS 2016, MS MARCO is a collection of datasets focused on deep learning in search.\n\nThe first dataset was a question answering dataset featuring 100,000 real Bing questions and a human generated answer. \nSince then we released a 1,000,000 question dataset, a natural langauge generation dataset, a passage ranking dataset, \nkeyphrase extraction dataset, crawling dataset, and a conversational search.\n\nThere have been 277 submissions. 20 KeyPhrase Extraction submissions, 87 passage ranking submissions, 0 document ranking \nsubmissions, 73 QnA V2 submissions, 82 NLGEN submisions, and 15 QnA V1 submissions\n\nThis data comes in three tasks/forms: Original QnA dataset(v1.1), Question Answering(v2.1), Natural Language Generation(v2.1). \n\nThe original question answering datset featured 100,000 examples and was released in 2016. Leaderboard is now closed but data is availible below.\n\nThe current competitive tasks are Question Answering and Natural Language Generation. Question Answering features over 1,000,000 queries and \nis much like the original QnA dataset but bigger and with higher quality. The Natural Language Generation dataset features 180,000 examples and \nbuilds upon the QnA dataset to deliver answers that could be spoken by a smart speaker.\n\n\nversion v1.1", "citation": "\n@article{DBLP:journals/corr/NguyenRSGTMD16,\n author = {Tri Nguyen and\n Mir Rosenberg and\n Xia Song and\n Jianfeng Gao and\n Saurabh Tiwary and\n Rangan Majumder and\n Li Deng},\n title = {{MS} {MARCO:} {A} Human Generated MAchine Reading COmprehension Dataset},\n journal = {CoRR},\n volume = {abs/1611.09268},\n year = {2016},\n url = {http://arxiv.org/abs/1611.09268},\n archivePrefix = {arXiv},\n eprint = {1611.09268},\n timestamp = {Mon, 13 Aug 2018 16:49:03 +0200},\n biburl = {https://dblp.org/rec/journals/corr/NguyenRSGTMD16.bib},\n bibsource = {dblp computer science bibliography, https://dblp.org}\n}\n}\n", "homepage": "https://microsoft.github.io/msmarco/", "license": "", "features": {"answers": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "passages": {"feature": {"is_selected": {"dtype": "int32", "id": null, "_type": "Value"}, "passage_text": {"dtype": "string", "id": null, "_type": "Value"}, "url": {"dtype": "string", "id": null, "_type": "Value"}}, "length": -1, "id": null, "_type": "Sequence"}, "query": {"dtype": "string", "id": null, "_type": "Value"}, "query_id": {"dtype": "int32", "id": null, "_type": "Value"}, "query_type": {"dtype": "string", "id": null, "_type": "Value"}, "wellFormedAnswers": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}}, "supervised_keys": null, "builder_name": "ms_marco", "config_name": "v1.1", "version": {"version_str": "1.1.0", "description": "New split API (https://tensorflow.org/datasets/splits)", "nlp_version_to_prepare": null, "major": 1, "minor": 1, "patch": 0}, "splits": {"validation": {"name": "validation", "num_bytes": 42710107, "num_examples": 10047, "dataset_name": "ms_marco"}, "train": {"name": "train", "num_bytes": 350884446, "num_examples": 82326, "dataset_name": "ms_marco"}, "test": {"name": "test", "num_bytes": 41020711, "num_examples": 9650, "dataset_name": "ms_marco"}}, "download_checksums": {"https://msmarco.blob.core.windows.net/msmsarcov1/train_v1.1.json.gz": {"num_bytes": 110704491, "checksum": "2aaa60df3a758137f0bb7c01fe334858477eb46fa8665ea01588e553cda6aa9f"}, "https://msmarco.blob.core.windows.net/msmsarcov1/dev_v1.1.json.gz": {"num_bytes": 13493661, "checksum": "c70fcb1de78e635cf501264891a1a56d52e7f63e69623da7dd41d89a785d67ca"}, "https://msmarco.blob.core.windows.net/msmsarcov1/test_hidden_v1.1.json": {"num_bytes": 44499856, "checksum": "083aa4f4d86ba0cedb830ca9972eff69f73cbc32b1da26b8617205f0dedea757"}}, "download_size": 168698008, "dataset_size": 434615264, "size_in_bytes": 603313272}, "v2.1": {"description": "\nStarting with a paper released at NIPS 2016, MS MARCO is a collection of datasets focused on deep learning in search.\n\nThe first dataset was a question answering dataset featuring 100,000 real Bing questions and a human generated answer. \nSince then we released a 1,000,000 question dataset, a natural langauge generation dataset, a passage ranking dataset, \nkeyphrase extraction dataset, crawling dataset, and a conversational search.\n\nThere have been 277 submissions. 20 KeyPhrase Extraction submissions, 87 passage ranking submissions, 0 document ranking \nsubmissions, 73 QnA V2 submissions, 82 NLGEN submisions, and 15 QnA V1 submissions\n\nThis data comes in three tasks/forms: Original QnA dataset(v1.1), Question Answering(v2.1), Natural Language Generation(v2.1). \n\nThe original question answering datset featured 100,000 examples and was released in 2016. Leaderboard is now closed but data is availible below.\n\nThe current competitive tasks are Question Answering and Natural Language Generation. Question Answering features over 1,000,000 queries and \nis much like the original QnA dataset but bigger and with higher quality. The Natural Language Generation dataset features 180,000 examples and \nbuilds upon the QnA dataset to deliver answers that could be spoken by a smart speaker.\n\n\nversion v2.1", "citation": "\n@article{DBLP:journals/corr/NguyenRSGTMD16,\n author = {Tri Nguyen and\n Mir Rosenberg and\n Xia Song and\n Jianfeng Gao and\n Saurabh Tiwary and\n Rangan Majumder and\n Li Deng},\n title = {{MS} {MARCO:} {A} Human Generated MAchine Reading COmprehension Dataset},\n journal = {CoRR},\n volume = {abs/1611.09268},\n year = {2016},\n url = {http://arxiv.org/abs/1611.09268},\n archivePrefix = {arXiv},\n eprint = {1611.09268},\n timestamp = {Mon, 13 Aug 2018 16:49:03 +0200},\n biburl = {https://dblp.org/rec/journals/corr/NguyenRSGTMD16.bib},\n bibsource = {dblp computer science bibliography, https://dblp.org}\n}\n}\n", "homepage": "https://microsoft.github.io/msmarco/", "license": "", "features": {"answers": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "passages": {"feature": {"is_selected": {"dtype": "int32", "id": null, "_type": "Value"}, "passage_text": {"dtype": "string", "id": null, "_type": "Value"}, "url": {"dtype": "string", "id": null, "_type": "Value"}}, "length": -1, "id": null, "_type": "Sequence"}, "query": {"dtype": "string", "id": null, "_type": "Value"}, "query_id": {"dtype": "int32", "id": null, "_type": "Value"}, "query_type": {"dtype": "string", "id": null, "_type": "Value"}, "wellFormedAnswers": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}}, "supervised_keys": null, "builder_name": "ms_marco", "config_name": "v2.1", "version": {"version_str": "2.1.0", "description": "New split API (https://tensorflow.org/datasets/splits)", "nlp_version_to_prepare": null, "major": 2, "minor": 1, "patch": 0}, "splits": {"validation": {"name": "validation", "num_bytes": 414286005, "num_examples": 101093, "dataset_name": "ms_marco"}, "train": {"name": "train", "num_bytes": 3466972085, "num_examples": 808731, "dataset_name": "ms_marco"}, "test": {"name": "test", "num_bytes": 406197152, "num_examples": 101092, "dataset_name": "ms_marco"}}, "download_checksums": {"https://msmarco.blob.core.windows.net/msmarco/train_v2.1.json.gz": {"num_bytes": 1112116929, "checksum": "e91745411ca81e441a3bb75deb71ce000dc2fc31334085b7d499982f14218fe2"}, "https://msmarco.blob.core.windows.net/msmarco/dev_v2.1.json.gz": {"num_bytes": 138303699, "checksum": "5b3c9c20d1808ee199a930941b0d96f79e397e9234f77a1496890b138df7cb3c"}, "https://msmarco.blob.core.windows.net/msmarco/eval_v2.1_public.json.gz": {"num_bytes": 133851237, "checksum": "05ac0e448450d507e7ff8e37f48a41cc2d015f5bd2c7974d2445f00a53625db6"}}, "download_size": 1384271865, "dataset_size": 4287455242, "size_in_bytes": 5671727107}}