|
22 | 22 | # Many of these are *.sgm files, which are processed to produce plain text that can be used by this script.
|
23 | 23 | # The canonical location of unpacked, processed data is $SACREBLEU_DIR/$TEST/$SOURCE-$TARGET.{$SOURCE,$TARGET}
|
24 | 24 | DATASETS = {
|
| 25 | + "wmt20/tworefs": { |
| 26 | + 'data': ['http://data.statmt.org/wmt20/translation-task/test.tgz'], |
| 27 | + 'description': 'WMT20 news test sets with two references', |
| 28 | + 'md5': ['3b1f777cfd2fb15ccf66e9bfdb2b1699'], |
| 29 | + 'de-en': ['sgm/newstest2020-deen-src.de.sgm', 'sgm/newstest2020-deen-ref.en.sgm', 'sgm/newstestB2020-deen-ref.en.sgm'], |
| 30 | + 'en-de': ['sgm/newstest2020-ende-src.en.sgm', 'sgm/newstest2020-ende-ref.de.sgm', 'sgm/newstestB2020-ende-ref.de.sgm'], |
| 31 | + 'en-zh': ['sgm/newstest2020-enzh-src.en.sgm', 'sgm/newstest2020-enzh-ref.zh.sgm', 'sgm/newstestB2020-enzh-ref.zh.sgm'], |
| 32 | + 'ru-en': ['sgm/newstest2020-ruen-src.ru.sgm', 'sgm/newstest2020-ruen-ref.en.sgm', 'sgm/newstestB2020-ruen-ref.en.sgm'], |
| 33 | + 'zh-en': ['sgm/newstest2020-zhen-src.zh.sgm', 'sgm/newstest2020-zhen-ref.en.sgm', 'sgm/newstestB2020-zhen-ref.en.sgm'], |
| 34 | + }, |
| 35 | + "wmt20": { |
| 36 | + 'data': ['http://data.statmt.org/wmt20/translation-task/test.tgz'], |
| 37 | + 'description': 'Official evaluation data for WMT20', |
| 38 | + 'md5': ['3b1f777cfd2fb15ccf66e9bfdb2b1699'], |
| 39 | + 'cs-en': ['sgm/newstest2020-csen-src.cs.sgm', 'sgm/newstest2020-csen-ref.en.sgm'], |
| 40 | + 'de-en': ['sgm/newstest2020-deen-src.de.sgm', 'sgm/newstest2020-deen-ref.en.sgm'], |
| 41 | + 'de-fr': ['sgm/newstest2020-defr-src.de.sgm', 'sgm/newstest2020-defr-ref.fr.sgm'], |
| 42 | + 'en-cs': ['sgm/newstest2020-encs-src.en.sgm', 'sgm/newstest2020-encs-ref.cs.sgm'], |
| 43 | + 'en-de': ['sgm/newstest2020-ende-src.en.sgm', 'sgm/newstest2020-ende-ref.de.sgm'], |
| 44 | + 'en-iu': ['sgm/newstest2020-eniu-src.en.sgm', 'sgm/newstest2020-eniu-ref.iu.sgm'], |
| 45 | + 'en-ja': ['sgm/newstest2020-enja-src.en.sgm', 'sgm/newstest2020-enja-ref.ja.sgm'], |
| 46 | + 'en-km': ['sgm/newstest2020-enkm-src.en.sgm', 'sgm/newstest2020-enkm-ref.km.sgm'], |
| 47 | + 'en-pl': ['sgm/newstest2020-enpl-src.en.sgm', 'sgm/newstest2020-enpl-ref.pl.sgm'], |
| 48 | + 'en-ps': ['sgm/newstest2020-enps-src.en.sgm', 'sgm/newstest2020-enps-ref.ps.sgm'], |
| 49 | + 'en-ru': ['sgm/newstest2020-enru-src.en.sgm', 'sgm/newstest2020-enru-ref.ru.sgm'], |
| 50 | + 'en-ta': ['sgm/newstest2020-enta-src.en.sgm', 'sgm/newstest2020-enta-ref.ta.sgm'], |
| 51 | + 'en-zh': ['sgm/newstest2020-enzh-src.en.sgm', 'sgm/newstest2020-enzh-ref.zh.sgm'], |
| 52 | + 'fr-de': ['sgm/newstest2020-frde-src.fr.sgm', 'sgm/newstest2020-frde-ref.de.sgm'], |
| 53 | + 'iu-en': ['sgm/newstest2020-iuen-src.iu.sgm', 'sgm/newstest2020-iuen-ref.en.sgm'], |
| 54 | + 'ja-en': ['sgm/newstest2020-jaen-src.ja.sgm', 'sgm/newstest2020-jaen-ref.en.sgm'], |
| 55 | + 'km-en': ['sgm/newstest2020-kmen-src.km.sgm', 'sgm/newstest2020-kmen-ref.en.sgm'], |
| 56 | + 'pl-en': ['sgm/newstest2020-plen-src.pl.sgm', 'sgm/newstest2020-plen-ref.en.sgm'], |
| 57 | + 'ps-en': ['sgm/newstest2020-psen-src.ps.sgm', 'sgm/newstest2020-psen-ref.en.sgm'], |
| 58 | + 'ru-en': ['sgm/newstest2020-ruen-src.ru.sgm', 'sgm/newstest2020-ruen-ref.en.sgm'], |
| 59 | + 'ta-en': ['sgm/newstest2020-taen-src.ta.sgm', 'sgm/newstest2020-taen-ref.en.sgm'], |
| 60 | + 'zh-en': ['sgm/newstest2020-zhen-src.zh.sgm', 'sgm/newstest2020-zhen-ref.en.sgm'], |
| 61 | + }, |
25 | 62 | 'mtnt2019': {
|
26 | 63 | 'data': ['http://www.cs.cmu.edu/~pmichel1/hosting/MTNT2019.tar.gz'],
|
27 | 64 | 'description': 'Test set for the WMT 19 robustness shared task',
|
|
78 | 115 | 'data': ['http://data.statmt.org/wmt19/translation-task/test.tgz'],
|
79 | 116 | 'description': 'Official evaluation data.',
|
80 | 117 | 'md5': ['84de7162d158e28403103b01aeefc39a'],
|
| 118 | + 'citation': r"""@proceedings{ws-2019-machine, |
| 119 | + title = "Proceedings of the Fourth Conference on Machine Translation (Volume 1: Research Papers)", |
| 120 | + editor = "Bojar, Ond{\v{r}}ej and |
| 121 | + Chatterjee, Rajen and |
| 122 | + Federmann, Christian and |
| 123 | + Fishel, Mark and |
| 124 | + Graham, Yvette and |
| 125 | + Haddow, Barry and |
| 126 | + Huck, Matthias and |
| 127 | + Yepes, Antonio Jimeno and |
| 128 | + Koehn, Philipp and |
| 129 | + Martins, Andr{\'e} and |
| 130 | + Monz, Christof and |
| 131 | + Negri, Matteo and |
| 132 | + N{\'e}v{\'e}ol, Aur{\'e}lie and |
| 133 | + Neves, Mariana and |
| 134 | + Post, Matt and |
| 135 | + Turchi, Marco and |
| 136 | + Verspoor, Karin", |
| 137 | + month = aug, |
| 138 | + year = "2019", |
| 139 | + address = "Florence, Italy", |
| 140 | + publisher = "Association for Computational Linguistics", |
| 141 | + url = "https://www.aclweb.org/anthology/W19-5200", |
| 142 | +}""", |
81 | 143 | 'cs-de': ['sgm/newstest2019-csde-src.cs.sgm', 'sgm/newstest2019-csde-ref.de.sgm'],
|
82 | 144 | 'de-cs': ['sgm/newstest2019-decs-src.de.sgm', 'sgm/newstest2019-decs-ref.cs.sgm'],
|
83 | 145 | 'de-en': ['sgm/newstest2019-deen-src.de.sgm', 'sgm/newstest2019-deen-ref.en.sgm'],
|
|
0 commit comments