7 changes: 7 additions & 0 deletions stanza/models/lemma/data.py
@@ -117,6 +117,13 @@ def __iter__(self):
    def load_doc(self, doc):
        data = doc.get([TEXT, UPOS, LEMMA])
        data = self.resolve_none(data)
        if self.args.get('caseless', False):
            data = self.lowercase_data(data)
        return data

    def lowercase_data(self, data):
        for token in data:
            token[0] = token[0].lower()
        return data

    def resolve_none(self, data):
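A minimal sketch of the data shape that load_doc hands to lowercase_data, assuming doc.get([TEXT, UPOS, LEMMA]) yields [text, upos, lemma] triples; the sample tokens are illustrative only:

# Illustrative only: mirrors the shape returned by doc.get([TEXT, UPOS, LEMMA])
data = [["Antennae", "NOUN", "antenna"],
        ["Quod", "PRON", "qui"]]
# lowercase_data touches only the surface form at index 0; UPOS and gold lemma stay as-is
for token in data:
    token[0] = token[0].lower()
print(data)  # [['antennae', 'NOUN', 'antenna'], ['quod', 'PRON', 'qui']]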
9 changes: 9 additions & 0 deletions stanza/models/lemma/trainer.py
@@ -45,6 +45,9 @@ def __init__(self, args=None, vocab=None, emb_matrix=None, model_file=None, devi
            # dict-based components
            self.word_dict = dict()
            self.composite_dict = dict()

        self.caseless = self.args.get('caseless', False)

        if not self.args['dict_only']:
            self.model = self.model.to(device)
            if self.args.get('edit', False):
@@ -164,6 +167,8 @@ def predict_dict(self, pairs):
        lemmas = []
        for p in pairs:
            w, pos = p
            if self.caseless:
                w = w.lower()
            if (w,pos) in self.composite_dict:
                lemmas += [self.composite_dict[(w,pos)]]
            elif w in self.word_dict:
@@ -178,6 +183,8 @@ def skip_seq2seq(self, pairs):
        skip = []
        for p in pairs:
            w, pos = p
            if self.caseless:
                w = w.lower()
            if (w,pos) in self.composite_dict:
                skip.append(True)
            elif w in self.word_dict:
@@ -192,6 +199,8 @@ def ensemble(self, pairs, other_preds):
        assert len(pairs) == len(other_preds)
        for p, pred in zip(pairs, other_preds):
            w, pos = p
            if self.caseless:
                w = w.lower()
            if (w,pos) in self.composite_dict:
                lemma = self.composite_dict[(w,pos)]
            elif w in self.word_dict:
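A rough sketch of the dictionary lookup path these hunks change, with a toy word_dict and composite_dict standing in for the trained model (the contents below are assumptions for illustration, not the real dictionaries):

# Hypothetical lookup mirroring predict_dict above; dictionaries are toy examples
word_dict = {"antennae": "antenna"}                 # keys lowercased at training time
composite_dict = {("antennae", "NOUN"): "antenna"}

def lookup(w, pos, caseless=True):
    if caseless:
        w = w.lower()                               # "Antennae" -> "antennae"
    if (w, pos) in composite_dict:
        return composite_dict[(w, pos)]
    if w in word_dict:
        return word_dict[w]
    return None                                     # would fall through to the seq2seq model

print(lookup("Antennae", "NOUN"))                   # antenna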
13 changes: 13 additions & 0 deletions stanza/models/lemmatizer.py
@@ -78,6 +78,8 @@ def build_argparse():
    parser.add_argument('--save_dir', type=str, default='saved_models/lemma', help='Root dir for saving models.')
    parser.add_argument('--save_name', type=str, default="{shorthand}_{embedding}_lemmatizer.pt", help="File name to save the model")

    parser.add_argument('--caseless', default=False, action='store_true', help='Lowercase everything first before processing. This will happen automatically if 100%% of the data is caseless')

    parser.add_argument('--seed', type=int, default=1234)
    utils.add_device_args(parser)

@@ -110,6 +112,13 @@ def main(args=None):
    else:
        evaluate(args)

def all_lowercase(doc):
    for sentence in doc.sentences:
        for word in sentence.words:
            if word.text.lower() != word.text:
                return False
    return True

def build_model_filename(args):
    embedding = "nocharlm"
    if args['charlm'] and args['charlm_forward_file']:
@@ -147,6 +156,10 @@ def train(args):
        logger.warning("[Skip training because no training data available...]")
        return

    if not args['caseless'] and all_lowercase(train_doc):
        logger.info("Building a caseless model, as all of the training data is caseless")
        args['caseless'] = True

    # start training
    # train a dictionary-based lemmatizer
    logger.info("Building lemmatizer in %s", model_file)
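A quick way to check whether a treebank would trigger the automatic caseless detection at training time, using the all_lowercase helper added above. The file path is illustrative, and passing input_file to CoNLL.conll2doc is assumed to work the same way input_str does in the tests below:

from stanza.models.lemmatizer import all_lowercase
from stanza.utils.conll import CoNLL

train_doc = CoNLL.conll2doc(input_file="la_ittb-ud-train.conllu")  # illustrative path
# True means the trained model ends up with caseless=True even without --caseless
print(all_lowercase(train_doc))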
57 changes: 57 additions & 0 deletions stanza/tests/lemma/test_lowercase.py
@@ -0,0 +1,57 @@
import pytest

from stanza.models.lemmatizer import all_lowercase
from stanza.utils.conll import CoNLL

LATIN_CONLLU = """
# sent_id = train-s1
# text = unde et philosophus dicit felicitatem esse operationem perfectam.
# reference = ittb-scg-s4203
1 unde unde ADV O4 AdvType=Loc|PronType=Rel 4 advmod:lmod _ _
2 et et CCONJ O4 _ 3 advmod:emph _ _
3 philosophus philosophus NOUN B1|grn1|casA|gen1 Case=Nom|Gender=Masc|InflClass=IndEurO|Number=Sing 4 nsubj _ _
4 dicit dico VERB N3|modA|tem1|gen6 Aspect=Imp|InflClass=LatX|Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin|Voice=Act 0 root _ TraditionalMood=Indicativus|TraditionalTense=Praesens
5 felicitatem felicitas NOUN C1|grn1|casD|gen2 Case=Acc|Gender=Fem|InflClass=IndEurX|Number=Sing 7 nsubj _ _
6 esse sum AUX N3|modH|tem1 Aspect=Imp|Tense=Pres|VerbForm=Inf 7 cop _ _
7 operationem operatio NOUN C1|grn1|casD|gen2|vgr1 Case=Acc|Gender=Fem|InflClass=IndEurX|Number=Sing 4 ccomp _ _
8 perfectam perfectus ADJ A1|grn1|casD|gen2 Case=Acc|Gender=Fem|InflClass=IndEurA|Number=Sing 7 amod _ SpaceAfter=No
9 . . PUNCT Punc _ 4 punct _ _

# sent_id = train-s2
# text = perfectio autem operationis dependet ex quatuor.
# reference = ittb-scg-s4204
1 perfectio perfectio NOUN C1|grn1|casA|gen2 Case=Nom|Gender=Fem|InflClass=IndEurX|Number=Sing 4 nsubj _ _
2 autem autem PART O4 _ 4 discourse _ _
3 operationis operatio NOUN C1|grn1|casB|gen2|vgr1 Case=Gen|Gender=Fem|InflClass=IndEurX|Number=Sing 1 nmod _ _
4 dependet dependeo VERB K3|modA|tem1|gen6 Aspect=Imp|InflClass=LatE|Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin|Voice=Act 0 root _ TraditionalMood=Indicativus|TraditionalTense=Praesens
5 ex ex ADP S4|vgr2 _ 6 case _ _
6 quatuor quattuor NUM G1|gen3|vgr1 NumForm=Word|NumType=Card 4 obl:arg _ SpaceAfter=No
7 . . PUNCT Punc _ 4 punct _ _
""".lstrip()

ENG_CONLLU = """
# sent_id = weblog-juancole.com_juancole_20051126063000_ENG_20051126_063000-0007
# text = You wonder if he was manipulating the market with his bombing targets.
1 You you PRON PRP Case=Nom|Person=2|PronType=Prs 2 nsubj 2:nsubj _
2 wonder wonder VERB VBP Mood=Ind|Number=Sing|Person=2|Tense=Pres|VerbForm=Fin 0 root 0:root _
3 if if SCONJ IN _ 6 mark 6:mark _
4 he he PRON PRP Case=Nom|Gender=Masc|Number=Sing|Person=3|PronType=Prs 6 nsubj 6:nsubj _
5 was be AUX VBD Mood=Ind|Number=Sing|Person=3|Tense=Past|VerbForm=Fin 6 aux 6:aux _
6 manipulating manipulate VERB VBG Tense=Pres|VerbForm=Part 2 ccomp 2:ccomp _
7 the the DET DT Definite=Def|PronType=Art 8 det 8:det _
8 market market NOUN NN Number=Sing 6 obj 6:obj _
9 with with ADP IN _ 12 case 12:case _
10 his his PRON PRP$ Case=Gen|Gender=Masc|Number=Sing|Person=3|Poss=Yes|PronType=Prs 12 nmod:poss 12:nmod:poss _
11 bombing bombing NOUN NN Number=Sing 12 compound 12:compound _
12 targets target NOUN NNS Number=Plur 6 obl 6:obl:with SpaceAfter=No
13 . . PUNCT . _ 2 punct 2:punct _
""".lstrip()


def test_all_lowercase():
    doc = CoNLL.conll2doc(input_str=LATIN_CONLLU)
    assert all_lowercase(doc)

def test_not_all_lowercase():
    doc = CoNLL.conll2doc(input_str=ENG_CONLLU)
    assert not all_lowercase(doc)
30 changes: 30 additions & 0 deletions stanza/tests/pipeline/test_lemmatizer.py
@@ -104,3 +104,33 @@ def test_store_results():
    assert stuff == stuff2

    assert az not in lemmatizer.word_dict

def test_caseless_lemmatizer():
    """
    Test that setting the lemmatizer as caseless at Pipeline time lowercases the text
    """
    nlp = stanza.Pipeline('en', processors='tokenize,pos,lemma', model_dir=TEST_MODELS_DIR, download_method=None)
    # the capital letter here should throw off the lemmatizer & it won't remove the plural
    # although weirdly the current English model *does* lowercase the A
    doc = nlp("Jennifer has nice Antennae")
    assert doc.sentences[0].words[-1].lemma == 'antennae'

    nlp = stanza.Pipeline('en', processors='tokenize,pos,lemma', model_dir=TEST_MODELS_DIR, download_method=None, lemma_caseless=True)
    # with the model set to lowercasing, the word will be treated as if it were 'antennae'
    doc = nlp("Jennifer has nice Antennae")
    assert doc.sentences[0].words[-1].lemma == 'antenna'

def test_latin_caseless_lemmatizer():
    """
    Test the Latin caseless lemmatizer
    """
    nlp = stanza.Pipeline('la', package='ittb', processors='tokenize,pos,lemma', model_dir=TEST_MODELS_DIR, download_method=None)
    lemmatizer = nlp.processors['lemma']
    assert lemmatizer.config['caseless']
Member:
Do the published models have this flag already, and it just poofed? Otherwise don't we need to release new models with this?

Lgtm otherwise

Collaborator (Author):
I redid the Latin ITTB model with this flag. It's the only one so far, but all of the others should automatically pick it up if their entire dataset is lowercased. I do wonder, though, whether that will be surprising if, for example, a Chinese model accidentally gets used on some English text.


doc = nlp("Quod Erat Demonstrandum")
expected_lemmas = "qui sum demonstro".split()
assert len(doc.sentences) == 1
assert len(doc.sentences[0].words) == 3
for word, expected in zip(doc.sentences[0].words, expected_lemmas):
assert word.lemma == expected
2 changes: 2 additions & 0 deletions stanza/tests/setup.py
@@ -40,6 +40,8 @@
stanza.download(lang='en', model_dir=models_dir, logging_level='info')
stanza.download(lang="en", model_dir=models_dir, package=None, processors={"ner":"ncbi_disease"})
stanza.download(lang='fr', model_dir=models_dir, logging_level='info')
# Latin ITTB has no case information for the lemmatizer
stanza.download(lang='la', model_dir=models_dir, package='ittb', logging_level='info')
stanza.download(lang='zh', model_dir=models_dir, logging_level='info')
# useful not just for verifying RtL, but because the default Arabic has a unique style of xpos tags
stanza.download(lang='ar', model_dir=models_dir, logging_level='info')