Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions datasets/ms_marco/dataset_infos.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"v1.1": {"description": "\nStarting with a paper released at NIPS 2016, MS MARCO is a collection of datasets focused on deep learning in search.\n\nThe first dataset was a question answering dataset featuring 100,000 real Bing questions and a human generated answer. \nSince then we released a 1,000,000 question dataset, a natural langauge generation dataset, a passage ranking dataset, \nkeyphrase extraction dataset, crawling dataset, and a conversational search.\n\nThere have been 277 submissions. 20 KeyPhrase Extraction submissions, 87 passage ranking submissions, 0 document ranking \nsubmissions, 73 QnA V2 submissions, 82 NLGEN submisions, and 15 QnA V1 submissions\n\nThis data comes in three tasks/forms: Original QnA dataset(v1.1), Question Answering(v2.1), Natural Language Generation(v2.1). \n\nThe original question answering datset featured 100,000 examples and was released in 2016. Leaderboard is now closed but data is availible below.\n\nThe current competitive tasks are Question Answering and Natural Language Generation. Question Answering features over 1,000,000 queries and \nis much like the original QnA dataset but bigger and with higher quality. 
The Natural Language Generation dataset features 180,000 examples and \nbuilds upon the QnA dataset to deliver answers that could be spoken by a smart speaker.\n\n\nversion v1.1", "citation": "\n@article{DBLP:journals/corr/NguyenRSGTMD16,\n author = {Tri Nguyen and\n Mir Rosenberg and\n Xia Song and\n Jianfeng Gao and\n Saurabh Tiwary and\n Rangan Majumder and\n Li Deng},\n title = {{MS} {MARCO:} {A} Human Generated MAchine Reading COmprehension Dataset},\n journal = {CoRR},\n volume = {abs/1611.09268},\n year = {2016},\n url = {http://arxiv.org/abs/1611.09268},\n archivePrefix = {arXiv},\n eprint = {1611.09268},\n timestamp = {Mon, 13 Aug 2018 16:49:03 +0200},\n biburl = {https://dblp.org/rec/journals/corr/NguyenRSGTMD16.bib},\n bibsource = {dblp computer science bibliography, https://dblp.org}\n}\n}\n", "homepage": "https://microsoft.github.io/msmarco/", "license": "", "features": {"answers": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "passages": {"feature": {"is_selected": {"dtype": "int32", "id": null, "_type": "Value"}, "passage_text": {"dtype": "string", "id": null, "_type": "Value"}, "url": {"dtype": "string", "id": null, "_type": "Value"}}, "length": -1, "id": null, "_type": "Sequence"}, "query": {"dtype": "string", "id": null, "_type": "Value"}, "query_id": {"dtype": "int32", "id": null, "_type": "Value"}, "query_type": {"dtype": "string", "id": null, "_type": "Value"}, "wellFormedAnswers": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}}, "supervised_keys": null, "builder_name": "ms_marco", "config_name": "v1.1", "version": {"version_str": "1.1.0", "description": "New split API (https://tensorflow.org/datasets/splits)", "nlp_version_to_prepare": null, "major": 1, "minor": 1, "patch": 0}, "splits": {"validation": {"name": "validation", "num_bytes": 42710107, "num_examples": 10047, "dataset_name": "ms_marco"}, "train": {"name": 
"train", "num_bytes": 350884446, "num_examples": 82326, "dataset_name": "ms_marco"}, "test": {"name": "test", "num_bytes": 41020711, "num_examples": 9650, "dataset_name": "ms_marco"}}, "download_checksums": {"https://msmarco.blob.core.windows.net/msmsarcov1/train_v1.1.json.gz": {"num_bytes": 110704491, "checksum": "2aaa60df3a758137f0bb7c01fe334858477eb46fa8665ea01588e553cda6aa9f"}, "https://msmarco.blob.core.windows.net/msmsarcov1/dev_v1.1.json.gz": {"num_bytes": 13493661, "checksum": "c70fcb1de78e635cf501264891a1a56d52e7f63e69623da7dd41d89a785d67ca"}, "https://msmarco.blob.core.windows.net/msmsarcov1/test_hidden_v1.1.json": {"num_bytes": 44499856, "checksum": "083aa4f4d86ba0cedb830ca9972eff69f73cbc32b1da26b8617205f0dedea757"}}, "download_size": 168698008, "dataset_size": 434615264, "size_in_bytes": 603313272}, "v2.1": {"description": "\nStarting with a paper released at NIPS 2016, MS MARCO is a collection of datasets focused on deep learning in search.\n\nThe first dataset was a question answering dataset featuring 100,000 real Bing questions and a human generated answer. \nSince then we released a 1,000,000 question dataset, a natural langauge generation dataset, a passage ranking dataset, \nkeyphrase extraction dataset, crawling dataset, and a conversational search.\n\nThere have been 277 submissions. 20 KeyPhrase Extraction submissions, 87 passage ranking submissions, 0 document ranking \nsubmissions, 73 QnA V2 submissions, 82 NLGEN submisions, and 15 QnA V1 submissions\n\nThis data comes in three tasks/forms: Original QnA dataset(v1.1), Question Answering(v2.1), Natural Language Generation(v2.1). \n\nThe original question answering datset featured 100,000 examples and was released in 2016. Leaderboard is now closed but data is availible below.\n\nThe current competitive tasks are Question Answering and Natural Language Generation. Question Answering features over 1,000,000 queries and \nis much like the original QnA dataset but bigger and with higher quality. 
The Natural Language Generation dataset features 180,000 examples and \nbuilds upon the QnA dataset to deliver answers that could be spoken by a smart speaker.\n\n\nversion v2.1", "citation": "\n@article{DBLP:journals/corr/NguyenRSGTMD16,\n author = {Tri Nguyen and\n Mir Rosenberg and\n Xia Song and\n Jianfeng Gao and\n Saurabh Tiwary and\n Rangan Majumder and\n Li Deng},\n title = {{MS} {MARCO:} {A} Human Generated MAchine Reading COmprehension Dataset},\n journal = {CoRR},\n volume = {abs/1611.09268},\n year = {2016},\n url = {http://arxiv.org/abs/1611.09268},\n archivePrefix = {arXiv},\n eprint = {1611.09268},\n timestamp = {Mon, 13 Aug 2018 16:49:03 +0200},\n biburl = {https://dblp.org/rec/journals/corr/NguyenRSGTMD16.bib},\n bibsource = {dblp computer science bibliography, https://dblp.org}\n}\n}\n", "homepage": "https://microsoft.github.io/msmarco/", "license": "", "features": {"answers": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "passages": {"feature": {"is_selected": {"dtype": "int32", "id": null, "_type": "Value"}, "passage_text": {"dtype": "string", "id": null, "_type": "Value"}, "url": {"dtype": "string", "id": null, "_type": "Value"}}, "length": -1, "id": null, "_type": "Sequence"}, "query": {"dtype": "string", "id": null, "_type": "Value"}, "query_id": {"dtype": "int32", "id": null, "_type": "Value"}, "query_type": {"dtype": "string", "id": null, "_type": "Value"}, "wellFormedAnswers": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}}, "supervised_keys": null, "builder_name": "ms_marco", "config_name": "v2.1", "version": {"version_str": "2.1.0", "description": "New split API (https://tensorflow.org/datasets/splits)", "nlp_version_to_prepare": null, "major": 2, "minor": 1, "patch": 0}, "splits": {"validation": {"name": "validation", "num_bytes": 414286005, "num_examples": 101093, "dataset_name": "ms_marco"}, "train": {"name": 
"train", "num_bytes": 3466972085, "num_examples": 808731, "dataset_name": "ms_marco"}, "test": {"name": "test", "num_bytes": 406197152, "num_examples": 101092, "dataset_name": "ms_marco"}}, "download_checksums": {"https://msmarco.blob.core.windows.net/msmarco/train_v2.1.json.gz": {"num_bytes": 1112116929, "checksum": "e91745411ca81e441a3bb75deb71ce000dc2fc31334085b7d499982f14218fe2"}, "https://msmarco.blob.core.windows.net/msmarco/dev_v2.1.json.gz": {"num_bytes": 138303699, "checksum": "5b3c9c20d1808ee199a930941b0d96f79e397e9234f77a1496890b138df7cb3c"}, "https://msmarco.blob.core.windows.net/msmarco/eval_v2.1_public.json.gz": {"num_bytes": 133851237, "checksum": "05ac0e448450d507e7ff8e37f48a41cc2d015f5bd2c7974d2445f00a53625db6"}}, "download_size": 1384271865, "dataset_size": 4287455242, "size_in_bytes": 5671727107}}
Binary file added datasets/ms_marco/dummy/v1.1/1.1.0/dummy_data.zip
Binary file not shown.
196 changes: 196 additions & 0 deletions datasets/ms_marco/ms_marco.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,196 @@
# coding=utf-8
# Copyright 2020 The TensorFlow Datasets Authors and the HuggingFace NLP Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Lint as: python3
"""MS MARCO dataset."""

from __future__ import absolute_import, division, print_function

import json

import nlp


_CITATION = """
@article{DBLP:journals/corr/NguyenRSGTMD16,
author = {Tri Nguyen and
Mir Rosenberg and
Xia Song and
Jianfeng Gao and
Saurabh Tiwary and
Rangan Majumder and
Li Deng},
title = {{MS} {MARCO:} {A} Human Generated MAchine Reading COmprehension Dataset},
journal = {CoRR},
volume = {abs/1611.09268},
year = {2016},
url = {http://arxiv.org/abs/1611.09268},
archivePrefix = {arXiv},
eprint = {1611.09268},
timestamp = {Mon, 13 Aug 2018 16:49:03 +0200},
biburl = {https://dblp.org/rec/journals/corr/NguyenRSGTMD16.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
}
"""

# Human-readable summary shared by every config. ``_info`` appends the selected
# config's own description to this text, so it must stay config-independent.
_DESCRIPTION = """
Starting with a paper released at NIPS 2016, MS MARCO is a collection of datasets focused on deep learning in search.

The first dataset was a question answering dataset featuring 100,000 real Bing questions and a human generated answer.
Since then we released a 1,000,000 question dataset, a natural langauge generation dataset, a passage ranking dataset,
keyphrase extraction dataset, crawling dataset, and a conversational search.

There have been 277 submissions. 20 KeyPhrase Extraction submissions, 87 passage ranking submissions, 0 document ranking
submissions, 73 QnA V2 submissions, 82 NLGEN submisions, and 15 QnA V1 submissions

This data comes in three tasks/forms: Original QnA dataset(v1.1), Question Answering(v2.1), Natural Language Generation(v2.1).

The original question answering datset featured 100,000 examples and was released in 2016. Leaderboard is now closed but data is availible below.

The current competitive tasks are Question Answering and Natural Language Generation. Question Answering features over 1,000,000 queries and
is much like the original QnA dataset but bigger and with higher quality. The Natural Language Generation dataset features 180,000 examples and
builds upon the QnA dataset to deliver answers that could be spoken by a smart speaker.

"""
_V2_URLS = {
"train": "https://msmarco.blob.core.windows.net/msmarco/train_v2.1.json.gz",
"dev": "https://msmarco.blob.core.windows.net/msmarco/dev_v2.1.json.gz",
"test": "https://msmarco.blob.core.windows.net/msmarco/eval_v2.1_public.json.gz",
}

_V1_URLS = {
"train": "https://msmarco.blob.core.windows.net/msmsarcov1/train_v1.1.json.gz",
"dev": "https://msmarco.blob.core.windows.net/msmsarcov1/dev_v1.1.json.gz",
"test": "https://msmarco.blob.core.windows.net/msmsarcov1/test_hidden_v1.1.json",
}


class MsMarcoConfig(nlp.BuilderConfig):
    """Configuration for a single MS MARCO release (e.g. ``v1.1`` or ``v2.1``)."""

    def __init__(self, **kwargs):
        """Create the config, handing every keyword argument to the base class.

        Args:
            **kwargs: passed through unchanged to ``nlp.BuilderConfig.__init__``.
        """
        super().__init__(**kwargs)


class MsMarco(nlp.GeneratorBasedBuilder):
    """Dataset builder for MS MARCO, offering the ``v1.1`` and ``v2.1`` configs."""

    BUILDER_CONFIGS = [
        MsMarcoConfig(
            name="v1.1",
            description="""version v1.1""",
            version=nlp.Version("1.1.0", "New split API (https://tensorflow.org/datasets/splits)"),
        ),
        MsMarcoConfig(
            name="v2.1",
            description="""version v2.1""",
            version=nlp.Version("2.1.0", "New split API (https://tensorflow.org/datasets/splits)"),
        ),
    ]

    def _info(self):
        """Return the ``nlp.DatasetInfo`` describing this dataset.

        The description is the shared module-level text followed by the
        description of the selected config.
        """
        return nlp.DatasetInfo(
            description=_DESCRIPTION + "\n" + self.config.description,
            features=nlp.Features(
                {
                    "answers": nlp.features.Sequence(nlp.Value("string")),
                    "passages": nlp.features.Sequence(
                        {
                            "is_selected": nlp.Value("int32"),
                            "passage_text": nlp.Value("string"),
                            "url": nlp.Value("string"),
                        }
                    ),
                    "query": nlp.Value("string"),
                    "query_id": nlp.Value("int32"),
                    "query_type": nlp.Value("string"),
                    "wellFormedAnswers": nlp.features.Sequence(nlp.Value("string")),
                }
            ),
            homepage="https://microsoft.github.io/msmarco/",
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Download the files of the selected config and describe the splits.

        Args:
            dl_manager: download manager; its ``download_and_extract`` is
                called with the URL mapping of the selected config.

        Returns:
            A list with the validation, train and test ``nlp.SplitGenerator``
            objects, each carrying the local ``filepath`` of its file.
        """
        # Any config name other than "v2.1" uses the v1.1 files.
        urls = _V2_URLS if self.config.name == "v2.1" else _V1_URLS
        dl_path = dl_manager.download_and_extract(urls)
        return [
            nlp.SplitGenerator(name=nlp.Split.VALIDATION, gen_kwargs={"filepath": dl_path["dev"]}),
            nlp.SplitGenerator(name=nlp.Split.TRAIN, gen_kwargs={"filepath": dl_path["train"]}),
            nlp.SplitGenerator(name=nlp.Split.TEST, gen_kwargs={"filepath": dl_path["test"]}),
        ]

    @staticmethod
    def _build_example(query, query_id, query_type, passages, answers, well_formed_answers):
        """Assemble one example dict in the layout declared by ``_info``.

        Args:
            query: the question text.
            query_id: identifier of the query.
            query_type: type label of the query.
            passages: list of passage dicts; each must contain
                ``"passage_text"`` and ``"url"``. A missing ``"is_selected"``
                is reported as ``-1``.
            answers: the answers for the query.
            well_formed_answers: the well-formed answers; the literal string
                ``"[]"`` is converted to an empty list.

        Returns:
            The example as a plain dict.
        """
        # Some records carry the string "[]" instead of a list here
        # (presumably meaning "no well-formed answer"); normalise it so the
        # value fits the declared sequence feature.
        if well_formed_answers == "[]":
            well_formed_answers = []
        return {
            "answers": answers,
            "passages": {
                "is_selected": [passage.get("is_selected", -1) for passage in passages],
                "passage_text": [passage["passage_text"] for passage in passages],
                "url": [passage["url"] for passage in passages],
            },
            "query": query,
            "query_id": query_id,
            "query_type": query_type,
            "wellFormedAnswers": well_formed_answers,
        }

    def _generate_examples(self, filepath):
        """Yield ``(query_id, example)`` pairs read from ``filepath``.

        Two on-disk layouts are handled, chosen by the config name:

        * ``v2.1``: the whole file is one JSON object in which every field
          (``"query"``, ``"passages"``, ...) maps a record key to that
          record's value.
        * ``v1.1``: every line of the file is a separate JSON object holding
          one record.

        ``"answers"`` and ``"wellFormedAnswers"`` may be absent (they are read
        with a default); in that case empty lists are produced.

        Args:
            filepath: path of the extracted data file for one split.

        Yields:
            Tuples of the record's ``query_id`` and its example dict.
        """
        # Read as UTF-8 explicitly: without ``encoding`` the decoding would
        # depend on the platform's locale and could fail on non-ASCII text.
        with open(filepath, encoding="utf-8") as f:
            if self.config.name == "v2.1":
                data = json.load(f)
                queries = data["query"]
                passages = data["passages"]
                query_ids = data["query_id"]
                query_types = data["query_type"]
                answers = data.get("answers", {})
                well_formed_answers = data.get("wellFormedAnswers", {})
                for key in queries:
                    query_id = query_ids[key]
                    yield query_id, self._build_example(
                        query=queries[key],
                        query_id=query_id,
                        query_type=query_types[key],
                        passages=passages[key],
                        answers=answers.get(key, []),
                        well_formed_answers=well_formed_answers.get(key, []),
                    )
            elif self.config.name == "v1.1":
                for row in f:
                    record = json.loads(row)
                    query_id = record["query_id"]
                    yield query_id, self._build_example(
                        query=record["query"],
                        query_id=query_id,
                        query_type=record["query_type"],
                        passages=record["passages"],
                        answers=record.get("answers", []),
                        well_formed_answers=record.get("wellFormedAnswers", []),
                    )