Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion datasets/germeval_14/dataset_infos.json
Original file line number Diff line number Diff line change
@@ -1 +1 @@
{"germeval_14": {"description": "The GermEval 2014 NER Shared Task builds on a new dataset with German Named Entity annotation with the following properties: - The data was sampled from German Wikipedia and News Corpora as a collection of citations. - The dataset covers over 31,000 sentences corresponding to over 590,000 tokens. - The NER annotation uses the NoSta-D guidelines, which extend the T\u00fcbingen Treebank guidelines, using four main NER categories with sub-structure, and annotating embeddings among NEs such as [ORG FC Kickers [LOC Darmstadt]].\n", "citation": "@inproceedings{benikova-etal-2014-nosta,\n title = \"{N}o{S}ta-D Named Entity Annotation for {G}erman: Guidelines and Dataset\",\n author = \"Benikova, Darina and\n Biemann, Chris and\n Reznicek, Marc\",\n booktitle = \"Proceedings of the Ninth International Conference on Language Resources and Evaluation ({LREC}'14)\",\n month = may,\n year = \"2014\",\n address = \"Reykjavik, Iceland\",\n publisher = \"European Language Resources Association (ELRA)\",\n url = \"http://www.lrec-conf.org/proceedings/lrec2014/pdf/276_Paper.pdf\",\n pages = \"2524--2531\",\n}\n", "homepage": "https://sites.google.com/site/germeval2014ner/", "license": "", "features": {"id": {"dtype": "string", "id": null, "_type": "Value"}, "source": {"dtype": "string", "id": null, "_type": "Value"}, "tokens": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "labels": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "nested-labels": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}}, "supervised_keys": null, "builder_name": "germ_eval14", "config_name": "germeval_14", "version": {"version_str": "1.0.0", "description": null, "nlp_version_to_prepare": null, "major": 1, "minor": 0, "patch": 0}, "splits": {"test": {"name": "test", "num_bytes": 2407839, "num_examples": 5100, "dataset_name": "germ_eval14"}, "train": {"name": "train", "num_bytes": 11305075, "num_examples": 24000, "dataset_name": "germ_eval14"}, "validation": {"name": "validation", "num_bytes": 1036186, "num_examples": 2200, "dataset_name": "germ_eval14"}}, "download_checksums": {"https://sites.google.com/site/germeval2014ner/data/NER-de-train.tsv": {"num_bytes": 7882358, "checksum": "1e5a803d81f5fe6ade54700a7e8e9107a45edba80469d42e41a360550d1758e7"}, "https://sites.google.com/site/germeval2014ner/data/NER-de-dev.tsv": {"num_bytes": 723876, "checksum": "d69d1347847e3ac0d1bfd14d7e5c0713dcb82899624301ced6df807dbb070056"}, "https://sites.google.com/site/germeval2014ner/data/NER-de-test.tsv": {"num_bytes": 1682738, "checksum": "9405e49532379f3aee048851d116b35823d31c04e9521b87a9c4e6572c269097"}}, "download_size": 10288972, "dataset_size": 14749100, "size_in_bytes": 25038072}}
{"germeval_14": {"description": "The GermEval 2014 NER Shared Task builds on a new dataset with German Named Entity annotation with the following properties: - The data was sampled from German Wikipedia and News Corpora as a collection of citations. - The dataset covers over 31,000 sentences corresponding to over 590,000 tokens. - The NER annotation uses the NoSta-D guidelines, which extend the T\u00fcbingen Treebank guidelines, using four main NER categories with sub-structure, and annotating embeddings among NEs such as [ORG FC Kickers [LOC Darmstadt]].\n", "citation": "@inproceedings{benikova-etal-2014-nosta,\n title = {NoSta-D Named Entity Annotation for German: Guidelines and Dataset},\n author = {Benikova, Darina and\n Biemann, Chris and\n Reznicek, Marc},\n booktitle = {Proceedings of the Ninth International Conference on Language Resources and Evaluation ({LREC}'14)},\n month = {may},\n year = {2014},\n address = {Reykjavik, Iceland},\n publisher = {European Language Resources Association (ELRA)},\n url = {http://www.lrec-conf.org/proceedings/lrec2014/pdf/276_Paper.pdf},\n pages = {2524--2531},\n}\n", "homepage": "https://sites.google.com/site/germeval2014ner/", "license": "", "features": {"id": {"dtype": "string", "id": null, "_type": "Value"}, "source": {"dtype": "string", "id": null, "_type": "Value"}, "tokens": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "labels": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "nested-labels": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}}, "post_processed": null, "supervised_keys": null, "builder_name": "germ_eval14", "config_name": "germeval_14", "version": {"version_str": "2.0.0", "description": null, "nlp_version_to_prepare": null, "major": 2, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 11305075, "num_examples": 24000, "dataset_name": "germ_eval14"}, "validation": {"name": "validation", "num_bytes": 1036186, "num_examples": 2200, "dataset_name": "germ_eval14"}, "test": {"name": "test", "num_bytes": 2407839, "num_examples": 5100, "dataset_name": "germ_eval14"}}, "download_checksums": {"https://drive.google.com/uc?export=download&id=1Jjhbal535VVz2ap4v4r_rN1UEHTdLK5P": {"num_bytes": 7882358, "checksum": "1e5a803d81f5fe6ade54700a7e8e9107a45edba80469d42e41a360550d1758e7"}, "https://drive.google.com/uc?export=download&id=1ZfRcQThdtAR5PPRjIDtrVP7BtXSCUBbm": {"num_bytes": 723876, "checksum": "d69d1347847e3ac0d1bfd14d7e5c0713dcb82899624301ced6df807dbb070056"}, "https://drive.google.com/uc?export=download&id=1u9mb7kNJHWQCWyweMDRMuTFoOHOfeBTH": {"num_bytes": 1682738, "checksum": "9405e49532379f3aee048851d116b35823d31c04e9521b87a9c4e6572c269097"}}, "download_size": 10288972, "post_processing_size": null, "dataset_size": 14749100, "size_in_bytes": 25038072}}
23 changes: 12 additions & 11 deletions datasets/germeval_14/germeval_14.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,10 +52,11 @@
such as [ORG FC Kickers [LOC Darmstadt]].
"""

_URL = "https://sites.google.com/site/germeval2014ner/data/"
_TRAINING_FILE = "NER-de-train.tsv"
_DEV_FILE = "NER-de-dev.tsv"
_TEST_FILE = "NER-de-test.tsv"
_URLS = {
"train": "https://drive.google.com/uc?export=download&id=1Jjhbal535VVz2ap4v4r_rN1UEHTdLK5P",
"dev": "https://drive.google.com/uc?export=download&id=1ZfRcQThdtAR5PPRjIDtrVP7BtXSCUBbm",
"test": "https://drive.google.com/uc?export=download&id=1u9mb7kNJHWQCWyweMDRMuTFoOHOfeBTH",
}


class GermEval14Config(nlp.BuilderConfig):
Expand All @@ -75,7 +76,7 @@ class GermEval14(nlp.GeneratorBasedBuilder):

BUILDER_CONFIGS = [
GermEval14Config(
name="germeval_14", version=nlp.Version("1.0.0"), description="GermEval 2014 NER Shared Task dataset"
name="germeval_14", version=nlp.Version("2.0.0"), description="GermEval 2014 NER Shared Task dataset"
),
]

Expand All @@ -98,12 +99,12 @@ def _info(self):

def _split_generators(self, dl_manager):
"""Returns SplitGenerators."""
urls_to_download = {
"train": f"{_URL}{_TRAINING_FILE}",
"dev": f"{_URL}{_DEV_FILE}",
"test": f"{_URL}{_TEST_FILE}",
}
downloaded_files = dl_manager.download_and_extract(urls_to_download)
downloaded_files = {}
for dataset in _URLS.keys():
downloaded_files[dataset] = dl_manager.download_and_extract(_URLS[dataset])
# Fix for dummy data
if os.path.isdir(downloaded_files[dataset]):
downloaded_files[dataset] = os.path.join(downloaded_files[dataset], f"NER-de-{dataset}.tsv")

return [
nlp.SplitGenerator(name=nlp.Split.TRAIN, gen_kwargs={"filepath": downloaded_files["train"]}),
Expand Down