Skip to content

Commit 38eb241

Browse files
mariamabarham, Mariama Drame, and patrickvonplaten
authored
add web_questions (#401)
* add web_questions * fix web questions dummy data Co-authored-by: Mariama Drame <mariama@debmower_ajd> Co-authored-by: Patrick von Platen <[email protected]>
1 parent e630d77 commit 38eb241

File tree

3 files changed

+100
-0
lines changed

3 files changed

+100
-0
lines changed
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
{"default": {"description": "This dataset consists of 6,642 question/answer pairs.\nThe questions are supposed to be answerable by Freebase, a large knowledge graph.\nThe questions are mostly centered around a single named entity.\nThe questions are popular ones asked on the web (at least in 2013).\n", "citation": "\n@inproceedings{berant-etal-2013-semantic,\n title = \"Semantic Parsing on {F}reebase from Question-Answer Pairs\",\n author = \"Berant, Jonathan and\n Chou, Andrew and\n Frostig, Roy and\n Liang, Percy\",\n booktitle = \"Proceedings of the 2013 Conference on Empirical Methods in Natural Language Processing\",\n month = oct,\n year = \"2013\",\n address = \"Seattle, Washington, USA\",\n publisher = \"Association for Computational Linguistics\",\n url = \"https://www.aclweb.org/anthology/D13-1160\",\n pages = \"1533--1544\",\n}\n", "homepage": "https://worksheets.codalab.org/worksheets/0xba659fe363cb46e7a505c5b6a774dc8a", "license": "", "features": {"url": {"dtype": "string", "id": null, "_type": "Value"}, "question": {"dtype": "string", "id": null, "_type": "Value"}, "answers": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}}, "supervised_keys": null, "builder_name": "web_questions", "config_name": "default", "version": {"version_str": "1.0.0", "description": null, "nlp_version_to_prepare": null, "major": 1, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 533736, "num_examples": 3778, "dataset_name": "web_questions"}, "test": {"name": "test", "num_bytes": 289824, "num_examples": 2032, "dataset_name": "web_questions"}}, "download_checksums": {"https://worksheets.codalab.org/rest/bundles/0x4a763f8cde224c2da592b75f29e2f5c2/contents/blob/": {"num_bytes": 825320, "checksum": "fb1797e4554a1b1be642388367de1379f8c0d5afc609ac171492c67f7b70cb1e"}, "https://worksheets.codalab.org/rest/bundles/0xe7bac352fce7448c9ef238fb0a297ec2/contents/blob/": {"num_bytes": 447645, 
"checksum": "e3d4550e90660aaabe18458ba34b59f2624857273f375af7353273ce8b84ce6e"}}, "download_size": 1272965, "dataset_size": 823560, "size_in_bytes": 2096525}}
860 Bytes
Binary file not shown.
Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,99 @@
1+
# coding=utf-8
2+
# Copyright 2020 The TensorFlow Datasets Authors and the HuggingFace NLP Authors.
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
16+
# Lint as: python3
17+
"""WebQuestions Benchmark for Question Answering."""
18+
19+
from __future__ import absolute_import, division, print_function
20+
21+
import json
22+
import re
23+
24+
import nlp
25+
26+
27+
# BibTeX entry for the WebQuestions paper (Berant et al., EMNLP 2013).
# Passed as `citation=` to `nlp.DatasetInfo` in `WebQuestions._info`.
# NOTE(review): the bare `NN+` lines interleaved below look like diff-viewer
# line markers from the scraped commit page, not part of the intended string;
# the original leading whitespace of each field also appears to be lost —
# confirm against the real file.
_CITATION = """
28+
@inproceedings{berant-etal-2013-semantic,
29+
title = "Semantic Parsing on {F}reebase from Question-Answer Pairs",
30+
author = "Berant, Jonathan and
31+
Chou, Andrew and
32+
Frostig, Roy and
33+
Liang, Percy",
34+
booktitle = "Proceedings of the 2013 Conference on Empirical Methods in Natural Language Processing",
35+
month = oct,
36+
year = "2013",
37+
address = "Seattle, Washington, USA",
38+
publisher = "Association for Computational Linguistics",
39+
url = "https://www.aclweb.org/anthology/D13-1160",
40+
pages = "1533--1544",
41+
}
42+
"""
43+
# Maps each split name to the URL of its data file. The whole dict is handed
# to `dl_manager.download` in `WebQuestions._split_generators`, and its keys
# ("train", "test") become the names of the generated splits.
_SPLIT_DOWNLOAD_URL = {
44+
"train": "https://worksheets.codalab.org/rest/bundles/0x4a763f8cde224c2da592b75f29e2f5c2/contents/blob/",
45+
"test": "https://worksheets.codalab.org/rest/bundles/0xe7bac352fce7448c9ef238fb0a297ec2/contents/blob/",
46+
}
47+
48+
# Human-readable dataset summary, passed as `description=` to
# `nlp.DatasetInfo` in `WebQuestions._info`.
# NOTE(review): the text says 6,642 question/answer pairs, but the split sizes
# recorded in the accompanying dataset metadata (3,778 train + 2,032 test)
# sum to 5,810 — confirm which figure is intended.
_DESCRIPTION = """\
49+
This dataset consists of 6,642 question/answer pairs.
50+
The questions are supposed to be answerable by Freebase, a large knowledge graph.
51+
The questions are mostly centered around a single named entity.
52+
The questions are popular ones asked on the web (at least in 2013).
53+
"""
54+
55+
56+
class WebQuestions(nlp.GeneratorBasedBuilder):
    """WebQuestions Benchmark for Question Answering.

    Each example exposes three fields: the source ``url``, the ``question``
    text, and a list of ``answers`` strings parsed out of the raw
    ``targetValue`` expression stored in the downloaded JSON files.
    """

    VERSION = nlp.Version("1.0.0")

    def _info(self):
        """Returns the dataset metadata (description, feature schema, citation)."""
        return nlp.DatasetInfo(
            description=_DESCRIPTION,
            features=nlp.Features(
                {
                    "url": nlp.Value("string"),
                    "question": nlp.Value("string"),
                    "answers": nlp.features.Sequence(nlp.Value("string")),
                }
            ),
            supervised_keys=None,
            homepage="https://worksheets.codalab.org/worksheets/0xba659fe363cb46e7a505c5b6a774dc8a",
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Returns SplitGenerators.

        Downloads every file listed in ``_SPLIT_DOWNLOAD_URL`` and creates one
        split per entry, named after the dict key and fed with the path that
        ``dl_manager.download`` returned for it.
        """
        file_paths = dl_manager.download(_SPLIT_DOWNLOAD_URL)

        return [
            nlp.SplitGenerator(name=split, gen_kwargs={"file_path": file_path})
            for split, file_path in file_paths.items()
        ]

    def _generate_examples(self, file_path):
        """Parses split file and yields examples.

        Args:
            file_path: Path to a JSON file containing a sequence of records,
                each with ``url``, ``utterance`` and ``targetValue`` keys.

        Yields:
            ``(index, example)`` pairs, where ``example`` has the keys
            ``url``, ``question`` and ``answers``.
        """

        def _target_to_answers(target):
            # The patterns imply a target shaped like
            #   (list (description "A") (description B))
            # -- presumably the upstream answer encoding; verify against data.
            # First drop the outer "(list " prefix and the final ")".
            target = re.sub(r"^\(list |\)$", "", target)
            # Each match is a (quoted, unquoted) tuple in which at most one
            # group is non-empty, so joining the tuple yields the answer text.
            matches = re.findall(r'\(description (?:"([^"]+?)"|([^)]+?))\)\w*', target)
            return ["".join(ans) for ans in matches]

        # Decode explicitly as UTF-8 instead of relying on the platform's
        # default encoding, which is not UTF-8 on every system.
        with open(file_path, encoding="utf-8") as f:
            examples = json.load(f)

        # The records are fully loaded at this point, so iterate outside the
        # `with` block: the file is closed before the first example is yielded
        # rather than staying open for as long as the consumer iterates.
        for i, ex in enumerate(examples):
            yield i, {
                "url": ex["url"],
                "question": ex["utterance"],
                "answers": _target_to_answers(ex["targetValue"]),
            }

0 commit comments

Comments
 (0)