Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions stanza/models/common/doc.py
Original file line number Diff line number Diff line change
Expand Up @@ -674,6 +674,9 @@ def init_from_misc(unit):
if hasattr(unit, attr):
setattr(unit, attr, value)
continue
elif key == NER:
# special case skipping NER for Words, since there is no Word NER field
continue
remaining_values.append(item)
unit._misc = "|".join(remaining_values)

Expand Down
31 changes: 31 additions & 0 deletions stanza/tests/common/test_data_conversion.py
Original file line number Diff line number Diff line change
Expand Up @@ -168,3 +168,34 @@ def test_zip_file():

doc = CoNLL.conll2doc(input_file=filename, zip_file=zip_file)
check_russian_doc(doc)

SIMPLE_NER = """
# text = Teferi's best friend is Karn
# sent_id = 0
1 Teferi _ _ _ _ 0 _ _ start_char=0|end_char=6|ner=S-PERSON
2 's _ _ _ _ 1 _ _ start_char=6|end_char=8|ner=O
3 best _ _ _ _ 2 _ _ start_char=9|end_char=13|ner=O
4 friend _ _ _ _ 3 _ _ start_char=14|end_char=20|ner=O
5 is _ _ _ _ 4 _ _ start_char=21|end_char=23|ner=O
6 Karn _ _ _ _ 5 _ _ start_char=24|end_char=28|ner=S-PERSON
""".strip()

def test_ner_conversion():
    """
    Test that tokens get properly created with NER tags

    Reads SIMPLE_NER with CoNLL.conll2doc, checks the NER tag on each token,
    checks that nothing is left in the token or word misc fields, and then
    checks that CoNLL.doc2conll reproduces the original text.
    """
    doc = CoNLL.conll2doc(input_str=SIMPLE_NER)
    assert len(doc.sentences) == 1
    sentence = doc.sentences[0]
    EXPECTED_NER = ["S-PERSON", "O", "O", "O", "O", "S-PERSON"]
    # zip() stops at the shorter input, so without this check the loop
    # below would pass vacuously if tokens were missing from the sentence
    assert len(sentence.tokens) == len(EXPECTED_NER)
    for token, ner in zip(sentence.tokens, EXPECTED_NER):
        assert token.ner == ner
        # check that the ner, start_char, end_char fields were not put on the token's misc
        # those should all be set as specific fields on the token
        assert not token.misc
        assert len(token.words) == 1
        # they should also not reach the word's misc field
        assert not token.words[0].misc

    # round trip: converting the doc back should give the original text
    conll = CoNLL.doc2conll(doc)
    assert "\n".join(conll[0]) == SIMPLE_NER
72 changes: 36 additions & 36 deletions stanza/tests/pipeline/test_english_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,60 +88,60 @@
EN_DOC_CONLLU_GOLD = """
# text = Barack Obama was born in Hawaii.
# sent_id = 0
1 Barack Barack PROPN NNP Number=Sing 4 nsubj:pass _ start_char=0|end_char=6
2 Obama Obama PROPN NNP Number=Sing 1 flat _ start_char=7|end_char=12
3 was be AUX VBD Mood=Ind|Number=Sing|Person=3|Tense=Past|VerbForm=Fin 4 aux:pass _ start_char=13|end_char=16
4 born bear VERB VBN Tense=Past|VerbForm=Part|Voice=Pass 0 root _ start_char=17|end_char=21
5 in in ADP IN _ 6 case _ start_char=22|end_char=24
6 Hawaii Hawaii PROPN NNP Number=Sing 4 obl _ start_char=25|end_char=31
7 . . PUNCT . _ 4 punct _ start_char=31|end_char=32
1 Barack Barack PROPN NNP Number=Sing 4 nsubj:pass _ start_char=0|end_char=6|ner=B-PERSON
2 Obama Obama PROPN NNP Number=Sing 1 flat _ start_char=7|end_char=12|ner=E-PERSON
3 was be AUX VBD Mood=Ind|Number=Sing|Person=3|Tense=Past|VerbForm=Fin 4 aux:pass _ start_char=13|end_char=16|ner=O
4 born bear VERB VBN Tense=Past|VerbForm=Part|Voice=Pass 0 root _ start_char=17|end_char=21|ner=O
5 in in ADP IN _ 6 case _ start_char=22|end_char=24|ner=O
6 Hawaii Hawaii PROPN NNP Number=Sing 4 obl _ start_char=25|end_char=31|ner=S-GPE
7 . . PUNCT . _ 4 punct _ start_char=31|end_char=32|ner=O

# text = He was elected president in 2008.
# sent_id = 1
1 He he PRON PRP Case=Nom|Gender=Masc|Number=Sing|Person=3|PronType=Prs 3 nsubj:pass _ start_char=34|end_char=36
2 was be AUX VBD Mood=Ind|Number=Sing|Person=3|Tense=Past|VerbForm=Fin 3 aux:pass _ start_char=37|end_char=40
3 elected elect VERB VBN Tense=Past|VerbForm=Part|Voice=Pass 0 root _ start_char=41|end_char=48
4 president president NOUN NN Number=Sing 3 xcomp _ start_char=49|end_char=58
5 in in ADP IN _ 6 case _ start_char=59|end_char=61
6 2008 2008 NUM CD NumForm=Digit|NumType=Card 3 obl _ start_char=62|end_char=66
7 . . PUNCT . _ 3 punct _ start_char=66|end_char=67
1 He he PRON PRP Case=Nom|Gender=Masc|Number=Sing|Person=3|PronType=Prs 3 nsubj:pass _ start_char=34|end_char=36|ner=O
2 was be AUX VBD Mood=Ind|Number=Sing|Person=3|Tense=Past|VerbForm=Fin 3 aux:pass _ start_char=37|end_char=40|ner=O
3 elected elect VERB VBN Tense=Past|VerbForm=Part|Voice=Pass 0 root _ start_char=41|end_char=48|ner=O
4 president president NOUN NN Number=Sing 3 xcomp _ start_char=49|end_char=58|ner=O
5 in in ADP IN _ 6 case _ start_char=59|end_char=61|ner=O
6 2008 2008 NUM CD NumForm=Digit|NumType=Card 3 obl _ start_char=62|end_char=66|ner=S-DATE
7 . . PUNCT . _ 3 punct _ start_char=66|end_char=67|ner=O

# text = Obama attended Harvard.
# sent_id = 2
1 Obama Obama PROPN NNP Number=Sing 2 nsubj _ start_char=69|end_char=74
2 attended attend VERB VBD Mood=Ind|Tense=Past|VerbForm=Fin 0 root _ start_char=75|end_char=83
3 Harvard Harvard PROPN NNP Number=Sing 2 obj _ start_char=84|end_char=91
4 . . PUNCT . _ 2 punct _ start_char=91|end_char=92
1 Obama Obama PROPN NNP Number=Sing 2 nsubj _ start_char=69|end_char=74|ner=S-PERSON
2 attended attend VERB VBD Mood=Ind|Tense=Past|VerbForm=Fin 0 root _ start_char=75|end_char=83|ner=O
3 Harvard Harvard PROPN NNP Number=Sing 2 obj _ start_char=84|end_char=91|ner=S-ORG
4 . . PUNCT . _ 2 punct _ start_char=91|end_char=92|ner=O

""".lstrip()

EN_DOC_CONLLU_GOLD_MULTIDOC = """
# text = Barack Obama was born in Hawaii.
# sent_id = 0
1 Barack Barack PROPN NNP Number=Sing 4 nsubj:pass _ start_char=0|end_char=6
2 Obama Obama PROPN NNP Number=Sing 1 flat _ start_char=7|end_char=12
3 was be AUX VBD Mood=Ind|Number=Sing|Person=3|Tense=Past|VerbForm=Fin 4 aux:pass _ start_char=13|end_char=16
4 born bear VERB VBN Tense=Past|VerbForm=Part|Voice=Pass 0 root _ start_char=17|end_char=21
5 in in ADP IN _ 6 case _ start_char=22|end_char=24
6 Hawaii Hawaii PROPN NNP Number=Sing 4 obl _ start_char=25|end_char=31
7 . . PUNCT . _ 4 punct _ start_char=31|end_char=32
1 Barack Barack PROPN NNP Number=Sing 4 nsubj:pass _ start_char=0|end_char=6|ner=B-PERSON
2 Obama Obama PROPN NNP Number=Sing 1 flat _ start_char=7|end_char=12|ner=E-PERSON
3 was be AUX VBD Mood=Ind|Number=Sing|Person=3|Tense=Past|VerbForm=Fin 4 aux:pass _ start_char=13|end_char=16|ner=O
4 born bear VERB VBN Tense=Past|VerbForm=Part|Voice=Pass 0 root _ start_char=17|end_char=21|ner=O
5 in in ADP IN _ 6 case _ start_char=22|end_char=24|ner=O
6 Hawaii Hawaii PROPN NNP Number=Sing 4 obl _ start_char=25|end_char=31|ner=S-GPE
7 . . PUNCT . _ 4 punct _ start_char=31|end_char=32|ner=O

# text = He was elected president in 2008.
# sent_id = 1
1 He he PRON PRP Case=Nom|Gender=Masc|Number=Sing|Person=3|PronType=Prs 3 nsubj:pass _ start_char=0|end_char=2
2 was be AUX VBD Mood=Ind|Number=Sing|Person=3|Tense=Past|VerbForm=Fin 3 aux:pass _ start_char=3|end_char=6
3 elected elect VERB VBN Tense=Past|VerbForm=Part|Voice=Pass 0 root _ start_char=7|end_char=14
4 president president NOUN NN Number=Sing 3 xcomp _ start_char=15|end_char=24
5 in in ADP IN _ 6 case _ start_char=25|end_char=27
6 2008 2008 NUM CD NumForm=Digit|NumType=Card 3 obl _ start_char=28|end_char=32
7 . . PUNCT . _ 3 punct _ start_char=32|end_char=33
1 He he PRON PRP Case=Nom|Gender=Masc|Number=Sing|Person=3|PronType=Prs 3 nsubj:pass _ start_char=0|end_char=2|ner=O
2 was be AUX VBD Mood=Ind|Number=Sing|Person=3|Tense=Past|VerbForm=Fin 3 aux:pass _ start_char=3|end_char=6|ner=O
3 elected elect VERB VBN Tense=Past|VerbForm=Part|Voice=Pass 0 root _ start_char=7|end_char=14|ner=O
4 president president NOUN NN Number=Sing 3 xcomp _ start_char=15|end_char=24|ner=O
5 in in ADP IN _ 6 case _ start_char=25|end_char=27|ner=O
6 2008 2008 NUM CD NumForm=Digit|NumType=Card 3 obl _ start_char=28|end_char=32|ner=S-DATE
7 . . PUNCT . _ 3 punct _ start_char=32|end_char=33|ner=O

# text = Obama attended Harvard.
# sent_id = 2
1 Obama Obama PROPN NNP Number=Sing 2 nsubj _ start_char=0|end_char=5
2 attended attend VERB VBD Mood=Ind|Tense=Past|VerbForm=Fin 0 root _ start_char=6|end_char=14
3 Harvard Harvard PROPN NNP Number=Sing 2 obj _ start_char=15|end_char=22
4 . . PUNCT . _ 2 punct _ start_char=22|end_char=23
1 Obama Obama PROPN NNP Number=Sing 2 nsubj _ start_char=0|end_char=5|ner=S-PERSON
2 attended attend VERB VBD Mood=Ind|Tense=Past|VerbForm=Fin 0 root _ start_char=6|end_char=14|ner=O
3 Harvard Harvard PROPN NNP Number=Sing 2 obj _ start_char=15|end_char=22|ner=S-ORG
4 . . PUNCT . _ 2 punct _ start_char=22|end_char=23|ner=O

""".lstrip()

Expand Down
23 changes: 7 additions & 16 deletions stanza/utils/conll.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,23 +7,10 @@

FIELD_NUM = 10

# TODO: unify this list with the list in common/doc.py
ID = 'id'
TEXT = 'text'
LEMMA = 'lemma'
UPOS = 'upos'
XPOS = 'xpos'
FEATS = 'feats'
HEAD = 'head'
DEPREL = 'deprel'
DEPS = 'deps'
MISC = 'misc'
NER = 'ner'
START_CHAR = 'start_char'
END_CHAR = 'end_char'
FIELD_TO_IDX = {ID: 0, TEXT: 1, LEMMA: 2, UPOS: 3, XPOS: 4, FEATS: 5, HEAD: 6, DEPREL: 7, DEPS: 8, MISC: 9}

from stanza.models.common.doc import Document
from stanza.models.common.doc import ID, TEXT, LEMMA, UPOS, XPOS, FEATS, HEAD, DEPREL, DEPS, MISC, NER, START_CHAR, END_CHAR

FIELD_TO_IDX = {ID: 0, TEXT: 1, LEMMA: 2, UPOS: 3, XPOS: 4, FEATS: 5, HEAD: 6, DEPREL: 7, DEPS: 8, MISC: 9}

class CoNLL:

Expand Down Expand Up @@ -152,9 +139,13 @@ def convert_token_dict(token_dict):
for key in token_dict:
if key == START_CHAR or key == END_CHAR:
misc.append("{}={}".format(key, token_dict[key]))
elif key == NER:
# TODO: potentially need to escape =|\ in the NER
misc.append("{}={}".format(key, token_dict[key]))
elif key == MISC:
# avoid appending a blank misc entry.
# otherwise the resulting misc field in the conll doc will wind up being blank text
# TODO: potentially need to escape =|\ in the MISC as well
if token_dict[key]:
misc.append(token_dict[key])
elif key == ID:
Expand Down