-
Notifications
You must be signed in to change notification settings - Fork 3k
Adding the KILT knowledge source and tasks #559
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
| @@ -0,0 +1 @@ | |||
| {"all_tasks": {"description": "KILT tasks training and evaluation data.\n- [FEVER](https://fever.ai) | Fact Checking | fever\n- [Natural Questions](https://ai.google.com/research/NaturalQuestions) | Open Domain QA | nq\n- [HotpotQA](https://hotpotqa.github.io) | Open Domain QA | hotpotqa\n- [TriviaQA](http://nlp.cs.washington.edu/triviaqa) | Open Domain QA | triviaqa\n- [ELI5](https://facebookresearch.github.io/ELI5/explore.html) | Open Domain QA | eli5\n- [T-REx](https://hadyelsahar.github.io/t-rex) | Slot Filling | trex\n- [Zero-Shot RE](http://nlp.cs.washington.edu/zeroshot) | Slot Filling | structured_zeroshot\n- [AIDA CoNLL-YAGO](https://www.mpi-inf.mpg.de/departments/databases-and-information-systems/research/ambiverse-nlu/aida/downloads) | Entity Linking | aidayago2\n- [WNED-WIKI](https://github.com/U-Alberta/wned) | Entity Linking | wned\n- [WNED-CWEB](https://github.com/U-Alberta/wned) | Entity Linking | cweb\n- [Wizard of Wikipedia](https://parl.ai/projects/wizard_of_wikipedia) | Dialogue | wow\n", "citation": "@inproceedings{fb_kilt,\n author = {Fabio Petroni and\n Aleksandra Piktus and\n Angela Fan and\n Patrick Lewis and\n Majid Yazdani and\n Nicola De Cao and\n James Thorne and\n Yacine Jernite and\n Vassilis Plachouras and\n Tim Rockt\"aschel and\n Sebastian Riedel},\n title = {{KILT:} a {B}enchmark for {K}nowledge {I}ntensive {L}anguage {T}asks},\n journal = {CoRR},\n archivePrefix = {arXiv},\n year = {2020},\n", "homepage": "https://facebookresearch.github.io/KILT/", "license": "", "features": {"id": {"dtype": "string", "id": null, "_type": "Value"}, "input": {"dtype": "string", "id": null, "_type": "Value"}, "meta": {"left_context": {"dtype": "string", "id": null, "_type": "Value"}, "mention": {"dtype": "string", "id": null, "_type": "Value"}, "right_context": {"dtype": "string", "id": null, "_type": "Value"}, "partial_evidence": {"feature": {"start_paragraph_id": {"dtype": "int32", "id": null, "_type": "Value"}, "end_paragraph_id": {"dtype": 
"int32", "id": null, "_type": "Value"}, "title": {"dtype": "string", "id": null, "_type": "Value"}, "section": {"dtype": "string", "id": null, "_type": "Value"}, "wikipedia_id": {"dtype": "string", "id": null, "_type": "Value"}, "meta": {"feature": {"evidence_span": {"dtype": "string", "id": null, "_type": "Value"}}, "length": -1, "id": null, "_type": "Sequence"}}, "length": -1, "id": null, "_type": "Sequence"}, "obj_surface": {"feature": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "length": -1, "id": null, "_type": "Sequence"}, "sub_surface": {"feature": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "length": -1, "id": null, "_type": "Sequence"}, "subj_aliases": {"feature": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "length": -1, "id": null, "_type": "Sequence"}, "template_questions": {"feature": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "length": -1, "id": null, "_type": "Sequence"}}, "output": {"feature": {"answer": {"dtype": "string", "id": null, "_type": "Value"}, "meta": {"score": {"dtype": "int32", "id": null, "_type": "Value"}}, "provenance": {"feature": {"bleu_score": {"dtype": "float32", "id": null, "_type": "Value"}, "start_character": {"dtype": "int32", "id": null, "_type": "Value"}, "start_paragraph_id": {"dtype": "int32", "id": null, "_type": "Value"}, "end_character": {"dtype": "int32", "id": null, "_type": "Value"}, "end_paragraph_id": {"dtype": "int32", "id": null, "_type": "Value"}, "meta": {"fever_page_id": {"dtype": "string", "id": null, "_type": "Value"}, "fever_sentence_id": {"dtype": "int32", "id": null, "_type": "Value"}, "annotation_id": {"dtype": "string", "id": null, "_type": "Value"}, "yes_no_answer": {"dtype": "string", "id": null, "_type": "Value"}, "evidence_span": {"feature": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "length": -1, "id": null, "_type": "Sequence"}}, "section": {"dtype": "string", "id": null, "_type": "Value"}, "title": {"dtype": 
"string", "id": null, "_type": "Value"}, "wikipedia_id": {"dtype": "string", "id": null, "_type": "Value"}}, "length": -1, "id": null, "_type": "Sequence"}}, "length": -1, "id": null, "_type": "Sequence"}}, "post_processed": {"features": null, "resources_checksums": {"train_fever": {}, "validation_fever": {}, "test_fever": {}, "train_nq": {}, "validation_nq": {}, "test_nq": {}, "train_hotpotqa": {}, "validation_hotpotqa": {}, "test_hotpotqa": {}, "train_triviaqa": {}, "validation_triviaqa": {}, "test_triviaqa": {}, "train_eli5": {}, "validation_eli5": {}, "test_eli5": {}, "train_trex": {}, "validation_trex": {}, "test_trex": {}, "train_structured_zeroshot": {}, "validation_structured_zeroshot": {}, "test_structured_zeroshot": {}, "train_aidayago2": {}, "validation_aidayago2": {}, "test_aidayago2": {}, "validation_wned": {}, "test_wned": {}, "validation_cweb": {}, "test_cweb": {}, "train_wow": {}, "validation_wow": {}, "test_wow": {}}}, "supervised_keys": null, "builder_name": "kilt_tasks", "config_name": "all_tasks", "version": {"version_str": "1.0.0", "description": "KILT tasks training and evaluation data", "nlp_version_to_prepare": null, "major": 1, "minor": 0, "patch": 0}, "splits": {"train_fever": {"name": "train_fever", "num_bytes": 27423152, "num_examples": 104966, "dataset_name": "kilt_tasks"}, "validation_fever": {"name": "validation_fever", "num_bytes": 3945700, "num_examples": 10444, "dataset_name": "kilt_tasks"}, "test_fever": {"name": "test_fever", "num_bytes": 1357227, "num_examples": 10100, "dataset_name": "kilt_tasks"}, "train_nq": {"name": "train_nq", "num_bytes": 35533981, "num_examples": 87372, "dataset_name": "kilt_tasks"}, "validation_nq": {"name": "validation_nq", "num_bytes": 6587478, "num_examples": 2837, "dataset_name": "kilt_tasks"}, "test_nq": {"name": "test_nq", "num_bytes": 379518, "num_examples": 1444, "dataset_name": "kilt_tasks"}, "train_hotpotqa": {"name": "train_hotpotqa", "num_bytes": 34838943, "num_examples": 88869, 
"dataset_name": "kilt_tasks"}, "validation_hotpotqa": {"name": "validation_hotpotqa", "num_bytes": 2428954, "num_examples": 5600, "dataset_name": "kilt_tasks"}, "test_hotpotqa": {"name": "test_hotpotqa", "num_bytes": 1061929, "num_examples": 5569, "dataset_name": "kilt_tasks"}, "train_triviaqa": {"name": "train_triviaqa", "num_bytes": 62374231, "num_examples": 61844, "dataset_name": "kilt_tasks"}, "validation_triviaqa": {"name": "validation_triviaqa", "num_bytes": 5962201, "num_examples": 5359, "dataset_name": "kilt_tasks"}, "test_triviaqa": {"name": "test_triviaqa", "num_bytes": 547068, "num_examples": 6586, "dataset_name": "kilt_tasks"}, "train_eli5": {"name": "train_eli5", "num_bytes": 531158920, "num_examples": 272634, "dataset_name": "kilt_tasks"}, "validation_eli5": {"name": "validation_eli5", "num_bytes": 13833241, "num_examples": 1507, "dataset_name": "kilt_tasks"}, "test_eli5": {"name": "test_eli5", "num_bytes": 126907, "num_examples": 600, "dataset_name": "kilt_tasks"}, "train_trex": {"name": "train_trex", "num_bytes": 1213971546, "num_examples": 2284168, "dataset_name": "kilt_tasks"}, "validation_trex": {"name": "validation_trex", "num_bytes": 2630523, "num_examples": 5000, "dataset_name": "kilt_tasks"}, "test_trex": {"name": "test_trex", "num_bytes": 915477, "num_examples": 5000, "dataset_name": "kilt_tasks"}, "train_structured_zeroshot": {"name": "train_structured_zeroshot", "num_bytes": 55988052, "num_examples": 147909, "dataset_name": "kilt_tasks"}, "validation_structured_zeroshot": {"name": "validation_structured_zeroshot", "num_bytes": 1848289, "num_examples": 3724, "dataset_name": "kilt_tasks"}, "test_structured_zeroshot": {"name": "test_structured_zeroshot", "num_bytes": 1297614, "num_examples": 4966, "dataset_name": "kilt_tasks"}, "train_aidayago2": {"name": "train_aidayago2", "num_bytes": 69671936, "num_examples": 18395, "dataset_name": "kilt_tasks"}, "validation_aidayago2": {"name": "validation_aidayago2", "num_bytes": 20932848, 
"num_examples": 4784, "dataset_name": "kilt_tasks"}, "test_aidayago2": {"name": "test_aidayago2", "num_bytes": 14350869, "num_examples": 4463, "dataset_name": "kilt_tasks"}, "validation_wned": {"name": "validation_wned", "num_bytes": 12794390, "num_examples": 3396, "dataset_name": "kilt_tasks"}, "test_wned": {"name": "test_wned", "num_bytes": 13187270, "num_examples": 3376, "dataset_name": "kilt_tasks"}, "validation_cweb": {"name": "validation_cweb", "num_bytes": 90041126, "num_examples": 5599, "dataset_name": "kilt_tasks"}, "test_cweb": {"name": "test_cweb", "num_bytes": 99382290, "num_examples": 5543, "dataset_name": "kilt_tasks"}, "train_wow": {"name": "train_wow", "num_bytes": 64964362, "num_examples": 94577, "dataset_name": "kilt_tasks"}, "validation_wow": {"name": "validation_wow", "num_bytes": 2142799, "num_examples": 3058, "dataset_name": "kilt_tasks"}, "test_wow": {"name": "test_wow", "num_bytes": 1432546, "num_examples": 2944, "dataset_name": "kilt_tasks"}}, "download_checksums": {"http://dl.fbaipublicfiles.com/KILT/fever-train-kilt.jsonl": {"num_bytes": 38941824, "checksum": "a42b948957680b3d316bbc9c24f3e499f0e93a3b0a8b94ca9d972d5da5758c6a"}, "http://dl.fbaipublicfiles.com/KILT/fever-dev-kilt.jsonl": {"num_bytes": 6174139, "checksum": "0132fb971e4206c8bd9da3916f0f46a30ee6610394aee1738ce6cf6644592739"}, "http://dl.fbaipublicfiles.com/KILT/fever-test_without_answers-kilt.jsonl": {"num_bytes": 838585, "checksum": "d95e0a7086c1d5687984460aa9d5b82b3afb58972132c8cc04a75dfd23cb1a86"}, "http://dl.fbaipublicfiles.com/KILT/nq-train-kilt.jsonl": {"num_bytes": 51895886, "checksum": "13629ef9f0c4bb7b7b237ce08fae2334acb32a2c1bf69db2350021cad46188a4"}, "http://dl.fbaipublicfiles.com/KILT/nq-dev-kilt.jsonl": {"num_bytes": 7936566, "checksum": "1bcdb150fb10504bd80b915730cde1b384c21647bcf14a4592e4e5df42cc6dfb"}, "http://dl.fbaipublicfiles.com/KILT/nq-test_without_answers-kilt.jsonl": {"num_bytes": 334047, "checksum": 
"1232f95e7d008669930c441444aa470d54678d886335175e2227455716309e15"}, "http://dl.fbaipublicfiles.com/KILT/hotpotqa-train-kilt.jsonl": {"num_bytes": 52767068, "checksum": "7ba73ede642336703d1235c63ecb4c20ae90c7eb19d36302bffa7eb1ec7f9153"}, "http://dl.fbaipublicfiles.com/KILT/hotpotqa-dev-kilt.jsonl": {"num_bytes": 3971321, "checksum": "7f2efbc22c462eb5710e5ecd99447702e7f48203eecd4489bf32a332389d70b6"}, "http://dl.fbaipublicfiles.com/KILT/hotpotqa-test_without_answers-kilt.jsonl": {"num_bytes": 778249, "checksum": "656d51f53796e3ea2a1f808a22160e7363a24a05f1ac7dd094d25659d963ef51"}, "http://dl.fbaipublicfiles.com/KILT/triviaqa-train_id-kilt.jsonl": {"num_bytes": 101613288, "checksum": "509e627a5e606d3b8018c04cd99415d7dca90d7b9a9f491ba0cf2db8c7e999ad"}, "http://dl.fbaipublicfiles.com/KILT/triviaqa-dev_id-kilt.jsonl": {"num_bytes": 9809706, "checksum": "bb505d8437100589de72256bf0d106a4a72da01df5a977fbc778688b8621d5a6"}, "http://dl.fbaipublicfiles.com/KILT/triviaqa-test_id_without_answers-kilt.jsonl": {"num_bytes": 123354, "checksum": "a6904eeb81a269cc675152c72b31c5b3361b40bf277a2973f2836670c7de4a19"}, "http://dl.fbaipublicfiles.com/KILT/eli5-train-kilt.jsonl": {"num_bytes": 548249898, "checksum": "66f19183bda5f5185fe98a5fe84925f6802d9b53a5bb12728ded760fd80ef0a0"}, "http://dl.fbaipublicfiles.com/KILT/eli5-dev-kilt.jsonl": {"num_bytes": 14149811, "checksum": "10387a08e3277d3e5722305cef8e739279c83de0218e7b3db76b301949822303"}, "http://dl.fbaipublicfiles.com/KILT/eli5-test_without_answers-kilt.jsonl": {"num_bytes": 98951, "checksum": "43f82439f15f3141d115c06fb0dc0697a2a6313705dbdec6668eaefe9925a2c3"}, "http://dl.fbaipublicfiles.com/KILT/trex-train-kilt.jsonl": {"num_bytes": 1752330104, "checksum": "724a8a0fd1f99fe9b15cf6309d7ce63dcb7cd31f33aea717ff26e6d6b6fbdd97"}, "http://dl.fbaipublicfiles.com/KILT/trex-dev-kilt.jsonl": {"num_bytes": 3803558, "checksum": "7ee1a2cd351d928d01ede1763ae8d6ae442a00b07deef80e347852f20a029e5f"}, 
"http://dl.fbaipublicfiles.com/KILT/trex-test_without_answers-kilt.jsonl": {"num_bytes": 895854, "checksum": "0a2c6a31ebea567e7a8b31c44bf2fb59dab601c8da6de7f3a53419c7710bdc43"}, "http://dl.fbaipublicfiles.com/KILT/structured_zeroshot-train-kilt.jsonl": {"num_bytes": 71444475, "checksum": "f666359fa2b23e75d700bd1f25efd12d3bf981b1be70d8ad2d268d67e9bb3a5d"}, "http://dl.fbaipublicfiles.com/KILT/structured_zeroshot-dev-kilt.jsonl": {"num_bytes": 2266707, "checksum": "dd120ba5b62c6499f4e79bf5bf5a908eb96b583ba9bf8c76a4535ac21c5dac8d"}, "http://dl.fbaipublicfiles.com/KILT/structured_zeroshot-test_without_answers-kilt.jsonl": {"num_bytes": 1216038, "checksum": "3353137527c6fe9e48910e2c4d94d3ac01352c52e90a618beaaa807968fd39da"}, "http://dl.fbaipublicfiles.com/KILT/aidayago2-train-kilt.jsonl": {"num_bytes": 70139831, "checksum": "398231ec4ee9fd5616456d049875e51b1e0709f00ac2d74398dd4480ae5647b2"}, "http://dl.fbaipublicfiles.com/KILT/aidayago2-dev-kilt.jsonl": {"num_bytes": 21061554, "checksum": "3f5e23e1af46c8671c9870e13ced13fd517029da4963ad9ff4b834c4b42deb52"}, "http://dl.fbaipublicfiles.com/KILT/aidayago2-test_without_answers-kilt.jsonl": {"num_bytes": 14436143, "checksum": "2ba0836d3f8c70126022a83fd78da0bf7c910fa7813996dd2afee5e38de63648"}, "http://dl.fbaipublicfiles.com/KILT/wned-dev-kilt.jsonl": {"num_bytes": 12868348, "checksum": "e8e91d120abcb3433c952541aebd8dbf0c6abe3c378bdd64cd6d2186738cfbaf"}, "http://dl.fbaipublicfiles.com/KILT/wned-test_without_answers-kilt.jsonl": {"num_bytes": 13295124, "checksum": "f7b220d404d474f617b00c4a030a87024835b72aff131016716b84feca4d95ee"}, "http://dl.fbaipublicfiles.com/KILT/cweb-dev-kilt.jsonl": {"num_bytes": 90228527, "checksum": "c98b2301818b53df2025b09801be095152244dbbe7242efbfffd018929bfb4bf"}, "http://dl.fbaipublicfiles.com/KILT/cweb-test_without_answers-kilt.jsonl": {"num_bytes": 100216209, "checksum": "18b078340bde25e42c7136c5bc80c2e3557803e40137b40230af5834e74b6c0d"}, "http://dl.fbaipublicfiles.com/KILT/wow-train-kilt.jsonl": 
{"num_bytes": 71861702, "checksum": "91e04200a31daadbc9178382b724dac9be88ea9e76cd7a668e7a9430fe67713a"}, "http://dl.fbaipublicfiles.com/KILT/wow-dev-kilt.jsonl": {"num_bytes": 2418241, "checksum": "3202148ecaf41e77d44a2dec2453452c1ff8b3d646727e2c55ae6d9024e8e6b6"}, "http://dl.fbaipublicfiles.com/KILT/wow-test_without_answers-kilt.jsonl": {"num_bytes": 1292018, "checksum": "f3303ec5d7def4cf178552a8ed17bd01cb23bfeccc3abde147bebfa4bade6d8b"}}, "download_size": 3067457126, "post_processing_size": 0, "dataset_size": 2393111387, "size_in_bytes": 5460568513}} No newline at end of file | |||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Two minor comments:
- It would be nice to keep the same order for the datasets as in the paper (FC,EL,SF,NQ,D)
- The GitHub link is wrong: https://facebookresearch.github.io/KILT/ should be https://github.com/facebookresearch/KILT
| """ | ||
|
|
||
| _DESCRIPTION = """\ | ||
| KILT tasks training and evaluation data. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
as before for the order
| """ | ||
|
|
||
|
|
||
| _DATA_URLS = { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
same
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks a lot Yacine! It looks good to me, just a couple of minor cosmetic comments. But it's good to go otherwise
|
Feel free to merge when you are happy with it @yjernite :-) |
This adds Wikipedia pre-processed for KILT, as well as the task data. Only the question IDs are provided for TriviaQA, but they can easily be mapped back with:
It would be great to have the dataset by Monday, which is when the paper should land on arXiv and @fabiopetroni is planning on tweeting about the paper and
the facebookresearch repository for the dataset.