huggingface · stefan-it · Sep 9, 2020 · Sep 9, 2020 · Sep 9, 2020
diff --git a/datasets/germeval_14/dataset_infos.json b/datasets/germeval_14/dataset_infos.json
@@ -1 +1 @@
-{"germeval_14": {"description": "The GermEval 2014 NER Shared Task builds on a new dataset with German Named Entity annotation with the following properties:    - The data was sampled from German Wikipedia and News Corpora as a collection of citations.    - The dataset covers over 31,000 sentences corresponding to over 590,000 tokens.    - The NER annotation uses the NoSta-D guidelines, which extend the T\u00fcbingen Treebank guidelines,      using four main NER categories with sub-structure, and annotating embeddings among NEs      such as [ORG FC Kickers [LOC Darmstadt]].\n", "citation": "@inproceedings{benikova-etal-2014-nosta,\n    title = \"{N}o{S}ta-D Named Entity Annotation for {G}erman: Guidelines and Dataset\",\n    author = \"Benikova, Darina  and\n      Biemann, Chris  and\n      Reznicek, Marc\",\n    booktitle = \"Proceedings of the Ninth International Conference on Language Resources and Evaluation ({LREC}'14)\",\n    month = may,\n    year = \"2014\",\n    address = \"Reykjavik, Iceland\",\n    publisher = \"European Language Resources Association (ELRA)\",\n    url = \"http://www.lrec-conf.org/proceedings/lrec2014/pdf/276_Paper.pdf\",\n    pages = \"2524--2531\",\n}\n", "homepage": "https://sites.google.com/site/germeval2014ner/", "license": "", "features": {"id": {"dtype": "string", "id": null, "_type": "Value"}, "source": {"dtype": "string", "id": null, "_type": "Value"}, "tokens": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "labels": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "nested-labels": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}}, "supervised_keys": null, "builder_name": "germ_eval14", "config_name": "germeval_14", "version": {"version_str": "1.0.0", "description": null, "nlp_version_to_prepare": null, "major": 1, "minor": 0, "patch": 0}, 
"splits": {"test": {"name": "test", "num_bytes": 2407839, "num_examples": 5100, "dataset_name": "germ_eval14"}, "train": {"name": "train", "num_bytes": 11305075, "num_examples": 24000, "dataset_name": "germ_eval14"}, "validation": {"name": "validation", "num_bytes": 1036186, "num_examples": 2200, "dataset_name": "germ_eval14"}}, "download_checksums": {"https://sites.google.com/site/germeval2014ner/data/NER-de-train.tsv": {"num_bytes": 7882358, "checksum": "1e5a803d81f5fe6ade54700a7e8e9107a45edba80469d42e41a360550d1758e7"}, "https://sites.google.com/site/germeval2014ner/data/NER-de-dev.tsv": {"num_bytes": 723876, "checksum": "d69d1347847e3ac0d1bfd14d7e5c0713dcb82899624301ced6df807dbb070056"}, "https://sites.google.com/site/germeval2014ner/data/NER-de-test.tsv": {"num_bytes": 1682738, "checksum": "9405e49532379f3aee048851d116b35823d31c04e9521b87a9c4e6572c269097"}}, "download_size": 10288972, "dataset_size": 14749100, "size_in_bytes": 25038072}}
+{"germeval_14": {"description": "The GermEval 2014 NER Shared Task builds on a new dataset with German Named Entity annotation with the following properties:    - The data was sampled from German Wikipedia and News Corpora as a collection of citations.    - The dataset covers over 31,000 sentences corresponding to over 590,000 tokens.    - The NER annotation uses the NoSta-D guidelines, which extend the T\u00fcbingen Treebank guidelines,      using four main NER categories with sub-structure, and annotating embeddings among NEs      such as [ORG FC Kickers [LOC Darmstadt]].\n", "citation": "@inproceedings{benikova-etal-2014-nosta,\n    title = {NoSta-D Named Entity Annotation for German: Guidelines and Dataset},\n    author = {Benikova, Darina  and\n      Biemann, Chris  and\n      Reznicek, Marc},\n    booktitle = {Proceedings of the Ninth International Conference on Language Resources and Evaluation ({LREC}'14)},\n    month = {may},\n    year = {2014},\n    address = {Reykjavik, Iceland},\n    publisher = {European Language Resources Association (ELRA)},\n    url = {http://www.lrec-conf.org/proceedings/lrec2014/pdf/276_Paper.pdf},\n    pages = {2524--2531},\n}\n", "homepage": "https://sites.google.com/site/germeval2014ner/", "license": "", "features": {"id": {"dtype": "string", "id": null, "_type": "Value"}, "source": {"dtype": "string", "id": null, "_type": "Value"}, "tokens": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "labels": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "nested-labels": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}}, "post_processed": null, "supervised_keys": null, "builder_name": "germ_eval14", "config_name": "germeval_14", "version": {"version_str": "2.0.0", "description": null, "nlp_version_to_prepare": null, "major": 2, "minor": 0, "patch": 
0}, "splits": {"train": {"name": "train", "num_bytes": 11305075, "num_examples": 24000, "dataset_name": "germ_eval14"}, "validation": {"name": "validation", "num_bytes": 1036186, "num_examples": 2200, "dataset_name": "germ_eval14"}, "test": {"name": "test", "num_bytes": 2407839, "num_examples": 5100, "dataset_name": "germ_eval14"}}, "download_checksums": {"https://drive.google.com/uc?export=download&id=1Jjhbal535VVz2ap4v4r_rN1UEHTdLK5P": {"num_bytes": 7882358, "checksum": "1e5a803d81f5fe6ade54700a7e8e9107a45edba80469d42e41a360550d1758e7"}, "https://drive.google.com/uc?export=download&id=1ZfRcQThdtAR5PPRjIDtrVP7BtXSCUBbm": {"num_bytes": 723876, "checksum": "d69d1347847e3ac0d1bfd14d7e5c0713dcb82899624301ced6df807dbb070056"}, "https://drive.google.com/uc?export=download&id=1u9mb7kNJHWQCWyweMDRMuTFoOHOfeBTH": {"num_bytes": 1682738, "checksum": "9405e49532379f3aee048851d116b35823d31c04e9521b87a9c4e6572c269097"}}, "download_size": 10288972, "post_processing_size": null, "dataset_size": 14749100, "size_in_bytes": 25038072}}
diff --git a/...14/dummy/germeval_14/1.0.0/dummy_data.zip → ...14/dummy/germeval_14/2.0.0/dummy_data.zip b/...14/dummy/germeval_14/1.0.0/dummy_data.zip → ...14/dummy/germeval_14/2.0.0/dummy_data.zip
diff --git a/datasets/germeval_14/germeval_14.py b/datasets/germeval_14/germeval_14.py
@@ -52,10 +52,11 @@
       such as [ORG FC Kickers [LOC Darmstadt]].
 """
 
-_URL = "https://sites.google.com/site/germeval2014ner/data/"
-_TRAINING_FILE = "NER-de-train.tsv"
-_DEV_FILE = "NER-de-dev.tsv"
-_TEST_FILE = "NER-de-test.tsv"
+_URLS = {
+    "train": "https://drive.google.com/uc?export=download&id=1Jjhbal535VVz2ap4v4r_rN1UEHTdLK5P",
+    "dev": "https://drive.google.com/uc?export=download&id=1ZfRcQThdtAR5PPRjIDtrVP7BtXSCUBbm",
+    "test": "https://drive.google.com/uc?export=download&id=1u9mb7kNJHWQCWyweMDRMuTFoOHOfeBTH",
+}
 
 
 class GermEval14Config(nlp.BuilderConfig):
@@ -75,7 +76,7 @@ class GermEval14(nlp.GeneratorBasedBuilder):
 
     BUILDER_CONFIGS = [
         GermEval14Config(
-            name="germeval_14", version=nlp.Version("1.0.0"), description="GermEval 2014 NER Shared Task dataset"
+            name="germeval_14", version=nlp.Version("2.0.0"), description="GermEval 2014 NER Shared Task dataset"
         ),
     ]
 
@@ -98,12 +99,12 @@ def _info(self):
 
     def _split_generators(self, dl_manager):
         """Returns SplitGenerators."""
-        urls_to_download = {
-            "train": f"{_URL}{_TRAINING_FILE}",
-            "dev": f"{_URL}{_DEV_FILE}",
-            "test": f"{_URL}{_TEST_FILE}",
-        }
-        downloaded_files = dl_manager.download_and_extract(urls_to_download)
+        downloaded_files = {}
+        for dataset in _URLS.keys():
+            downloaded_files[dataset] = dl_manager.download_and_extract(_URLS[dataset])
+            # Fix for dummy data: when the result is a directory, use the NER-de-<split>.tsv file inside it
+            if os.path.isdir(downloaded_files[dataset]):
+                downloaded_files[dataset] = os.path.join(downloaded_files[dataset], f"NER-de-{dataset}.tsv")
 
         return [
             nlp.SplitGenerator(name=nlp.Split.TRAIN, gen_kwargs={"filepath": downloaded_files["train"]}),
Original file line number	Diff line number	Diff line change
		@@ -1 +1 @@
		{"germeval_14": {"description": "The GermEval 2014 NER Shared Task builds on a new dataset with German Named Entity annotation with the following properties: - The data was sampled from German Wikipedia and News Corpora as a collection of citations. - The dataset covers over 31,000 sentences corresponding to over 590,000 tokens. - The NER annotation uses the NoSta-D guidelines, which extend the T\u00fcbingen Treebank guidelines, using four main NER categories with sub-structure, and annotating embeddings among NEs such as [ORG FC Kickers [LOC Darmstadt]].\n", "citation": "@inproceedings{benikova-etal-2014-nosta,\n title = \"{N}o{S}ta-D Named Entity Annotation for {G}erman: Guidelines and Dataset\",\n author = \"Benikova, Darina and\n Biemann, Chris and\n Reznicek, Marc\",\n booktitle = \"Proceedings of the Ninth International Conference on Language Resources and Evaluation ({LREC}'14)\",\n month = may,\n year = \"2014\",\n address = \"Reykjavik, Iceland\",\n publisher = \"European Language Resources Association (ELRA)\",\n url = \"http://www.lrec-conf.org/proceedings/lrec2014/pdf/276_Paper.pdf\",\n pages = \"2524--2531\",\n}\n", "homepage": "https://sites.google.com/site/germeval2014ner/", "license": "", "features": {"id": {"dtype": "string", "id": null, "_type": "Value"}, "source": {"dtype": "string", "id": null, "_type": "Value"}, "tokens": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "labels": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "nested-labels": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}}, "supervised_keys": null, "builder_name": "germ_eval14", "config_name": "germeval_14", "version": {"version_str": "1.0.0", "description": null, "nlp_version_to_prepare": null, "major": 1, "minor": 0, "patch": 0}, "splits": {"test": {"name": "test", "num_bytes": 2407839, 
"num_examples": 5100, "dataset_name": "germ_eval14"}, "train": {"name": "train", "num_bytes": 11305075, "num_examples": 24000, "dataset_name": "germ_eval14"}, "validation": {"name": "validation", "num_bytes": 1036186, "num_examples": 2200, "dataset_name": "germ_eval14"}}, "download_checksums": {"https://sites.google.com/site/germeval2014ner/data/NER-de-train.tsv": {"num_bytes": 7882358, "checksum": "1e5a803d81f5fe6ade54700a7e8e9107a45edba80469d42e41a360550d1758e7"}, "https://sites.google.com/site/germeval2014ner/data/NER-de-dev.tsv": {"num_bytes": 723876, "checksum": "d69d1347847e3ac0d1bfd14d7e5c0713dcb82899624301ced6df807dbb070056"}, "https://sites.google.com/site/germeval2014ner/data/NER-de-test.tsv": {"num_bytes": 1682738, "checksum": "9405e49532379f3aee048851d116b35823d31c04e9521b87a9c4e6572c269097"}}, "download_size": 10288972, "dataset_size": 14749100, "size_in_bytes": 25038072}}
		{"germeval_14": {"description": "The GermEval 2014 NER Shared Task builds on a new dataset with German Named Entity annotation with the following properties: - The data was sampled from German Wikipedia and News Corpora as a collection of citations. - The dataset covers over 31,000 sentences corresponding to over 590,000 tokens. - The NER annotation uses the NoSta-D guidelines, which extend the T\u00fcbingen Treebank guidelines, using four main NER categories with sub-structure, and annotating embeddings among NEs such as [ORG FC Kickers [LOC Darmstadt]].\n", "citation": "@inproceedings{benikova-etal-2014-nosta,\n title = {NoSta-D Named Entity Annotation for German: Guidelines and Dataset},\n author = {Benikova, Darina and\n Biemann, Chris and\n Reznicek, Marc},\n booktitle = {Proceedings of the Ninth International Conference on Language Resources and Evaluation ({LREC}'14)},\n month = {may},\n year = {2014},\n address = {Reykjavik, Iceland},\n publisher = {European Language Resources Association (ELRA)},\n url = {http://www.lrec-conf.org/proceedings/lrec2014/pdf/276_Paper.pdf},\n pages = {2524--2531},\n}\n", "homepage": "https://sites.google.com/site/germeval2014ner/", "license": "", "features": {"id": {"dtype": "string", "id": null, "_type": "Value"}, "source": {"dtype": "string", "id": null, "_type": "Value"}, "tokens": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "labels": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "nested-labels": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}}, "post_processed": null, "supervised_keys": null, "builder_name": "germ_eval14", "config_name": "germeval_14", "version": {"version_str": "2.0.0", "description": null, "nlp_version_to_prepare": null, "major": 2, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 
11305075, "num_examples": 24000, "dataset_name": "germ_eval14"}, "validation": {"name": "validation", "num_bytes": 1036186, "num_examples": 2200, "dataset_name": "germ_eval14"}, "test": {"name": "test", "num_bytes": 2407839, "num_examples": 5100, "dataset_name": "germ_eval14"}}, "download_checksums": {"https://drive.google.com/uc?export=download&id=1Jjhbal535VVz2ap4v4r_rN1UEHTdLK5P": {"num_bytes": 7882358, "checksum": "1e5a803d81f5fe6ade54700a7e8e9107a45edba80469d42e41a360550d1758e7"}, "https://drive.google.com/uc?export=download&id=1ZfRcQThdtAR5PPRjIDtrVP7BtXSCUBbm": {"num_bytes": 723876, "checksum": "d69d1347847e3ac0d1bfd14d7e5c0713dcb82899624301ced6df807dbb070056"}, "https://drive.google.com/uc?export=download&id=1u9mb7kNJHWQCWyweMDRMuTFoOHOfeBTH": {"num_bytes": 1682738, "checksum": "9405e49532379f3aee048851d116b35823d31c04e9521b87a9c4e6572c269097"}}, "download_size": 10288972, "post_processing_size": null, "dataset_size": 14749100, "size_in_bytes": 25038072}}