Skip to content

Commit abfbf38

Browse files
mjpost and ozancaglayan authored
Added WMT20 newstest (#109)
* Added WMT20 newstest (#103) * updated CHANGELOG and README Co-authored-by: Ozan Caglayan <[email protected]>
1 parent b4864c3 commit abfbf38

File tree

4 files changed

+76
-1
lines changed

4 files changed

+76
-1
lines changed

CHANGELOG.md

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,13 @@
11
# VERSION HISTORY
22

3+
- 1.4.13 (2020-07-30)
4+
- Added WMT20 newstest test sets (#103)
5+
- Make mecab-python3 an extra dependency, adapt code to new mecab-python3
6+
This fixes the recent Windows installation issues as well (#104)
7+
Japanese support should now be explicitly installed through sacrebleu[ja] package.
8+
- Fix return type annotation of corpus_bleu()
9+
- Improve sentence_score's documentation, do not allow single ref string (#98)
10+
311
- 1.4.12 (2020-07-03)
412
- Fix a deployment bug (#96)
513

README.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,11 @@ Install the Python module (Python 3 only)
1818

1919
pip3 install sacrebleu
2020

21+
In order to install Japanese tokenizer support through `mecab-python3`, you need to run the
22+
following command instead, to perform a full installation with dependencies:
23+
24+
pip3 install sacrebleu[ja]
25+
2126
Alternately, you can install from the source:
2227

2328
python3 setup.py install

sacrebleu/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
# express or implied. See the License for the specific language governing
1515
# permissions and limitations under the License.
1616

17-
__version__ = '1.4.12'
17+
__version__ = '1.4.13'
1818
__description__ = 'Hassle-free computation of shareable, comparable, and reproducible BLEU scores'
1919

2020

sacrebleu/dataset.py

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,43 @@
2222
# Many of these are *.sgm files, which are processed to produce plain text that can be used by this script.
2323
# The canonical location of unpacked, processed data is $SACREBLEU_DIR/$TEST/$SOURCE-$TARGET.{$SOURCE,$TARGET}
2424
DATASETS = {
25+
"wmt20/tworefs": {
26+
'data': ['http://data.statmt.org/wmt20/translation-task/test.tgz'],
27+
'description': 'WMT20 news test sets with two references',
28+
'md5': ['3b1f777cfd2fb15ccf66e9bfdb2b1699'],
29+
'de-en': ['sgm/newstest2020-deen-src.de.sgm', 'sgm/newstest2020-deen-ref.en.sgm', 'sgm/newstestB2020-deen-ref.en.sgm'],
30+
'en-de': ['sgm/newstest2020-ende-src.en.sgm', 'sgm/newstest2020-ende-ref.de.sgm', 'sgm/newstestB2020-ende-ref.de.sgm'],
31+
'en-zh': ['sgm/newstest2020-enzh-src.en.sgm', 'sgm/newstest2020-enzh-ref.zh.sgm', 'sgm/newstestB2020-enzh-ref.zh.sgm'],
32+
'ru-en': ['sgm/newstest2020-ruen-src.ru.sgm', 'sgm/newstest2020-ruen-ref.en.sgm', 'sgm/newstestB2020-ruen-ref.en.sgm'],
33+
'zh-en': ['sgm/newstest2020-zhen-src.zh.sgm', 'sgm/newstest2020-zhen-ref.en.sgm', 'sgm/newstestB2020-zhen-ref.en.sgm'],
34+
},
35+
"wmt20": {
36+
'data': ['http://data.statmt.org/wmt20/translation-task/test.tgz'],
37+
'description': 'Official evaluation data for WMT20',
38+
'md5': ['3b1f777cfd2fb15ccf66e9bfdb2b1699'],
39+
'cs-en': ['sgm/newstest2020-csen-src.cs.sgm', 'sgm/newstest2020-csen-ref.en.sgm'],
40+
'de-en': ['sgm/newstest2020-deen-src.de.sgm', 'sgm/newstest2020-deen-ref.en.sgm'],
41+
'de-fr': ['sgm/newstest2020-defr-src.de.sgm', 'sgm/newstest2020-defr-ref.fr.sgm'],
42+
'en-cs': ['sgm/newstest2020-encs-src.en.sgm', 'sgm/newstest2020-encs-ref.cs.sgm'],
43+
'en-de': ['sgm/newstest2020-ende-src.en.sgm', 'sgm/newstest2020-ende-ref.de.sgm'],
44+
'en-iu': ['sgm/newstest2020-eniu-src.en.sgm', 'sgm/newstest2020-eniu-ref.iu.sgm'],
45+
'en-ja': ['sgm/newstest2020-enja-src.en.sgm', 'sgm/newstest2020-enja-ref.ja.sgm'],
46+
'en-km': ['sgm/newstest2020-enkm-src.en.sgm', 'sgm/newstest2020-enkm-ref.km.sgm'],
47+
'en-pl': ['sgm/newstest2020-enpl-src.en.sgm', 'sgm/newstest2020-enpl-ref.pl.sgm'],
48+
'en-ps': ['sgm/newstest2020-enps-src.en.sgm', 'sgm/newstest2020-enps-ref.ps.sgm'],
49+
'en-ru': ['sgm/newstest2020-enru-src.en.sgm', 'sgm/newstest2020-enru-ref.ru.sgm'],
50+
'en-ta': ['sgm/newstest2020-enta-src.en.sgm', 'sgm/newstest2020-enta-ref.ta.sgm'],
51+
'en-zh': ['sgm/newstest2020-enzh-src.en.sgm', 'sgm/newstest2020-enzh-ref.zh.sgm'],
52+
'fr-de': ['sgm/newstest2020-frde-src.fr.sgm', 'sgm/newstest2020-frde-ref.de.sgm'],
53+
'iu-en': ['sgm/newstest2020-iuen-src.iu.sgm', 'sgm/newstest2020-iuen-ref.en.sgm'],
54+
'ja-en': ['sgm/newstest2020-jaen-src.ja.sgm', 'sgm/newstest2020-jaen-ref.en.sgm'],
55+
'km-en': ['sgm/newstest2020-kmen-src.km.sgm', 'sgm/newstest2020-kmen-ref.en.sgm'],
56+
'pl-en': ['sgm/newstest2020-plen-src.pl.sgm', 'sgm/newstest2020-plen-ref.en.sgm'],
57+
'ps-en': ['sgm/newstest2020-psen-src.ps.sgm', 'sgm/newstest2020-psen-ref.en.sgm'],
58+
'ru-en': ['sgm/newstest2020-ruen-src.ru.sgm', 'sgm/newstest2020-ruen-ref.en.sgm'],
59+
'ta-en': ['sgm/newstest2020-taen-src.ta.sgm', 'sgm/newstest2020-taen-ref.en.sgm'],
60+
'zh-en': ['sgm/newstest2020-zhen-src.zh.sgm', 'sgm/newstest2020-zhen-ref.en.sgm'],
61+
},
2562
'mtnt2019': {
2663
'data': ['http://www.cs.cmu.edu/~pmichel1/hosting/MTNT2019.tar.gz'],
2764
'description': 'Test set for the WMT 19 robustness shared task',
@@ -78,6 +115,31 @@
78115
'data': ['http://data.statmt.org/wmt19/translation-task/test.tgz'],
79116
'description': 'Official evaluation data.',
80117
'md5': ['84de7162d158e28403103b01aeefc39a'],
118+
'citation': r"""@proceedings{ws-2019-machine,
119+
title = "Proceedings of the Fourth Conference on Machine Translation (Volume 1: Research Papers)",
120+
editor = "Bojar, Ond{\v{r}}ej and
121+
Chatterjee, Rajen and
122+
Federmann, Christian and
123+
Fishel, Mark and
124+
Graham, Yvette and
125+
Haddow, Barry and
126+
Huck, Matthias and
127+
Yepes, Antonio Jimeno and
128+
Koehn, Philipp and
129+
Martins, Andr{\'e} and
130+
Monz, Christof and
131+
Negri, Matteo and
132+
N{\'e}v{\'e}ol, Aur{\'e}lie and
133+
Neves, Mariana and
134+
Post, Matt and
135+
Turchi, Marco and
136+
Verspoor, Karin",
137+
month = aug,
138+
year = "2019",
139+
address = "Florence, Italy",
140+
publisher = "Association for Computational Linguistics",
141+
url = "https://www.aclweb.org/anthology/W19-5200",
142+
}""",
81143
'cs-de': ['sgm/newstest2019-csde-src.cs.sgm', 'sgm/newstest2019-csde-ref.de.sgm'],
82144
'de-cs': ['sgm/newstest2019-decs-src.de.sgm', 'sgm/newstest2019-decs-ref.cs.sgm'],
83145
'de-en': ['sgm/newstest2019-deen-src.de.sgm', 'sgm/newstest2019-deen-ref.en.sgm'],

0 commit comments

Comments
 (0)