Skip to content

Commit eca6ba4

Browse files
lhoestq and stefan-it authored
Fix germeval url (#594)
* dataset: fix url of GermEval 2014 dataset (incl. version bump)
* datasets: update dataset infos for GermEval 2014
* datasets: bump version for GermEval 2014 dummy data
* remove dummy data hack in germeval

Co-authored-by: Stefan Schweter <[email protected]>
1 parent 9cbe9e6 commit eca6ba4

File tree

4 files changed

+8
-15
lines changed

4 files changed

+8
-15
lines changed
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
{"germeval_14": {"description": "The GermEval 2014 NER Shared Task builds on a new dataset with German Named Entity annotation with the following properties: - The data was sampled from German Wikipedia and News Corpora as a collection of citations. - The dataset covers over 31,000 sentences corresponding to over 590,000 tokens. - The NER annotation uses the NoSta-D guidelines, which extend the T\u00fcbingen Treebank guidelines, using four main NER categories with sub-structure, and annotating embeddings among NEs such as [ORG FC Kickers [LOC Darmstadt]].\n", "citation": "@inproceedings{benikova-etal-2014-nosta,\n title = \"{N}o{S}ta-D Named Entity Annotation for {G}erman: Guidelines and Dataset\",\n author = \"Benikova, Darina and\n Biemann, Chris and\n Reznicek, Marc\",\n booktitle = \"Proceedings of the Ninth International Conference on Language Resources and Evaluation ({LREC}'14)\",\n month = may,\n year = \"2014\",\n address = \"Reykjavik, Iceland\",\n publisher = \"European Language Resources Association (ELRA)\",\n url = \"http://www.lrec-conf.org/proceedings/lrec2014/pdf/276_Paper.pdf\",\n pages = \"2524--2531\",\n}\n", "homepage": "https://sites.google.com/site/germeval2014ner/", "license": "", "features": {"id": {"dtype": "string", "id": null, "_type": "Value"}, "source": {"dtype": "string", "id": null, "_type": "Value"}, "tokens": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "labels": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "nested-labels": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}}, "supervised_keys": null, "builder_name": "germ_eval14", "config_name": "germeval_14", "version": {"version_str": "1.0.0", "description": null, "nlp_version_to_prepare": null, "major": 1, "minor": 0, "patch": 0}, "splits": {"test": {"name": "test", "num_bytes": 2407839, 
"num_examples": 5100, "dataset_name": "germ_eval14"}, "train": {"name": "train", "num_bytes": 11305075, "num_examples": 24000, "dataset_name": "germ_eval14"}, "validation": {"name": "validation", "num_bytes": 1036186, "num_examples": 2200, "dataset_name": "germ_eval14"}}, "download_checksums": {"https://sites.google.com/site/germeval2014ner/data/NER-de-train.tsv": {"num_bytes": 7882358, "checksum": "1e5a803d81f5fe6ade54700a7e8e9107a45edba80469d42e41a360550d1758e7"}, "https://sites.google.com/site/germeval2014ner/data/NER-de-dev.tsv": {"num_bytes": 723876, "checksum": "d69d1347847e3ac0d1bfd14d7e5c0713dcb82899624301ced6df807dbb070056"}, "https://sites.google.com/site/germeval2014ner/data/NER-de-test.tsv": {"num_bytes": 1682738, "checksum": "9405e49532379f3aee048851d116b35823d31c04e9521b87a9c4e6572c269097"}}, "download_size": 10288972, "dataset_size": 14749100, "size_in_bytes": 25038072}}
1+
{"germeval_14": {"description": "The GermEval 2014 NER Shared Task builds on a new dataset with German Named Entity annotation with the following properties: - The data was sampled from German Wikipedia and News Corpora as a collection of citations. - The dataset covers over 31,000 sentences corresponding to over 590,000 tokens. - The NER annotation uses the NoSta-D guidelines, which extend the T\u00fcbingen Treebank guidelines, using four main NER categories with sub-structure, and annotating embeddings among NEs such as [ORG FC Kickers [LOC Darmstadt]].\n", "citation": "@inproceedings{benikova-etal-2014-nosta,\n title = {NoSta-D Named Entity Annotation for German: Guidelines and Dataset},\n author = {Benikova, Darina and\n Biemann, Chris and\n Reznicek, Marc},\n booktitle = {Proceedings of the Ninth International Conference on Language Resources and Evaluation ({LREC}'14)},\n month = {may},\n year = {2014},\n address = {Reykjavik, Iceland},\n publisher = {European Language Resources Association (ELRA)},\n url = {http://www.lrec-conf.org/proceedings/lrec2014/pdf/276_Paper.pdf},\n pages = {2524--2531},\n}\n", "homepage": "https://sites.google.com/site/germeval2014ner/", "license": "", "features": {"id": {"dtype": "string", "id": null, "_type": "Value"}, "source": {"dtype": "string", "id": null, "_type": "Value"}, "tokens": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "labels": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "nested-labels": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}}, "post_processed": null, "supervised_keys": null, "builder_name": "germ_eval14", "config_name": "germeval_14", "version": {"version_str": "2.0.0", "description": null, "nlp_version_to_prepare": null, "major": 2, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 
11305075, "num_examples": 24000, "dataset_name": "germ_eval14"}, "validation": {"name": "validation", "num_bytes": 1036186, "num_examples": 2200, "dataset_name": "germ_eval14"}, "test": {"name": "test", "num_bytes": 2407839, "num_examples": 5100, "dataset_name": "germ_eval14"}}, "download_checksums": {"https://drive.google.com/uc?export=download&id=1Jjhbal535VVz2ap4v4r_rN1UEHTdLK5P": {"num_bytes": 7882358, "checksum": "1e5a803d81f5fe6ade54700a7e8e9107a45edba80469d42e41a360550d1758e7"}, "https://drive.google.com/uc?export=download&id=1ZfRcQThdtAR5PPRjIDtrVP7BtXSCUBbm": {"num_bytes": 723876, "checksum": "d69d1347847e3ac0d1bfd14d7e5c0713dcb82899624301ced6df807dbb070056"}, "https://drive.google.com/uc?export=download&id=1u9mb7kNJHWQCWyweMDRMuTFoOHOfeBTH": {"num_bytes": 1682738, "checksum": "9405e49532379f3aee048851d116b35823d31c04e9521b87a9c4e6572c269097"}}, "download_size": 10288972, "post_processing_size": null, "dataset_size": 14749100, "size_in_bytes": 25038072}}
-10.2 KB
Binary file not shown.
12.3 KB
Binary file not shown.

datasets/germeval_14/germeval_14.py

Lines changed: 7 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -19,10 +19,7 @@
1919
from __future__ import absolute_import, division, print_function
2020

2121
import csv
22-
import glob
2322
import logging
24-
import os
25-
from pathlib import Path
2623

2724
import nlp
2825

@@ -52,10 +49,11 @@
5249
such as [ORG FC Kickers [LOC Darmstadt]].
5350
"""
5451

55-
_URL = "https://sites.google.com/site/germeval2014ner/data/"
56-
_TRAINING_FILE = "NER-de-train.tsv"
57-
_DEV_FILE = "NER-de-dev.tsv"
58-
_TEST_FILE = "NER-de-test.tsv"
52+
_URLS = {
53+
"train": "https://drive.google.com/uc?export=download&id=1Jjhbal535VVz2ap4v4r_rN1UEHTdLK5P",
54+
"dev": "https://drive.google.com/uc?export=download&id=1ZfRcQThdtAR5PPRjIDtrVP7BtXSCUBbm",
55+
"test": "https://drive.google.com/uc?export=download&id=1u9mb7kNJHWQCWyweMDRMuTFoOHOfeBTH",
56+
}
5957

6058

6159
class GermEval14Config(nlp.BuilderConfig):
@@ -75,7 +73,7 @@ class GermEval14(nlp.GeneratorBasedBuilder):
7573

7674
BUILDER_CONFIGS = [
7775
GermEval14Config(
78-
name="germeval_14", version=nlp.Version("1.0.0"), description="GermEval 2014 NER Shared Task dataset"
76+
name="germeval_14", version=nlp.Version("2.0.0"), description="GermEval 2014 NER Shared Task dataset"
7977
),
8078
]
8179

@@ -98,12 +96,7 @@ def _info(self):
9896

9997
def _split_generators(self, dl_manager):
10098
"""Returns SplitGenerators."""
101-
urls_to_download = {
102-
"train": f"{_URL}{_TRAINING_FILE}",
103-
"dev": f"{_URL}{_DEV_FILE}",
104-
"test": f"{_URL}{_TEST_FILE}",
105-
}
106-
downloaded_files = dl_manager.download_and_extract(urls_to_download)
99+
downloaded_files = dl_manager.download_and_extract(_URLS)
107100

108101
return [
109102
nlp.SplitGenerator(name=nlp.Split.TRAIN, gen_kwargs={"filepath": downloaded_files["train"]}),

0 commit comments

Comments
 (0)