Skip to content

Commit 0d1cff1

Browse files
vumichien, ydshieh, patrickvonplaten
authored
Add doc tests for Albert and Bigbird (#16774)
* Add doctest BERT
* make fixup
* fix typo
* change checkpoints
* make fixup
* define doctest output value, update doctest for mobilebert
* solve fix-copies
* update QA target start index and end index
* change checkpoint for docs and reuse defined variable
* Update src/transformers/models/bert/modeling_tf_bert.py
  Co-authored-by: Yih-Dar <[email protected]>
* Apply suggestions from code review
  Co-authored-by: Yih-Dar <[email protected]>
* Apply suggestions from code review
  Co-authored-by: Yih-Dar <[email protected]>
* make fixup
* Add Doctest for Albert and Bigbird
* make fixup
* overwrite examples for Albert and Bigbird
* Apply suggestions from code review
  Co-authored-by: Patrick von Platen <[email protected]>
* update longer examples for Bigbird
* using examples from squad_v2
* print out example text
* change name token-classification-big-bird checkpoint to random

Co-authored-by: Yih-Dar <[email protected]>
Co-authored-by: Patrick von Platen <[email protected]>
1 parent 9fa8817 commit 0d1cff1

File tree

4 files changed

+230
-63
lines changed

4 files changed

+230
-63
lines changed

src/transformers/models/albert/modeling_albert.py

Lines changed: 46 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -801,9 +801,8 @@ def forward(
801801
>>> tokenizer = AlbertTokenizer.from_pretrained("albert-base-v2")
802802
>>> model = AlbertForPreTraining.from_pretrained("albert-base-v2")
803803
804-
>>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(
805-
... 0
806-
>>> ) # Batch size 1
804+
>>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)
805+
>>> # Batch size 1
807806
>>> outputs = model(input_ids)
808807
809808
>>> prediction_logits = outputs.prediction_logits
@@ -914,12 +913,7 @@ def get_input_embeddings(self):
914913
return self.albert.embeddings.word_embeddings
915914

916915
@add_start_docstrings_to_model_forward(ALBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
917-
@add_code_sample_docstrings(
918-
processor_class=_TOKENIZER_FOR_DOC,
919-
checkpoint=_CHECKPOINT_FOR_DOC,
920-
output_type=MaskedLMOutput,
921-
config_class=_CONFIG_FOR_DOC,
922-
)
916+
@replace_return_docstrings(output_type=MaskedLMOutput, config_class=_CONFIG_FOR_DOC)
923917
def forward(
924918
self,
925919
input_ids=None,
@@ -938,6 +932,37 @@ def forward(
938932
Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
939933
config.vocab_size]` (see `input_ids` docstring). Tokens with indices set to `-100` are ignored (masked), the
940934
loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
935+
936+
Returns:
937+
938+
Example:
939+
940+
```python
941+
>>> import torch
942+
>>> from transformers import AlbertTokenizer, AlbertForMaskedLM
943+
944+
>>> tokenizer = AlbertTokenizer.from_pretrained("albert-base-v2")
945+
>>> model = AlbertForMaskedLM.from_pretrained("albert-base-v2")
946+
947+
>>> # add mask_token
948+
>>> inputs = tokenizer("The capital of [MASK] is Paris.", return_tensors="pt")
949+
>>> with torch.no_grad():
950+
... logits = model(**inputs).logits
951+
952+
>>> # retrieve index of [MASK]
953+
>>> mask_token_index = (inputs.input_ids == tokenizer.mask_token_id)[0].nonzero(as_tuple=True)[0]
954+
>>> predicted_token_id = logits[0, mask_token_index].argmax(axis=-1)
955+
>>> tokenizer.decode(predicted_token_id)
956+
'france'
957+
```
958+
959+
```python
960+
>>> labels = tokenizer("The capital of France is Paris.", return_tensors="pt")["input_ids"]
961+
>>> labels = torch.where(inputs.input_ids == tokenizer.mask_token_id, labels, -100)
962+
>>> outputs = model(**inputs, labels=labels)
963+
>>> round(outputs.loss.item(), 2)
964+
0.81
965+
```
941966
"""
942967
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
943968

@@ -996,9 +1021,11 @@ def __init__(self, config):
9961021
@add_start_docstrings_to_model_forward(ALBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
9971022
@add_code_sample_docstrings(
9981023
processor_class=_TOKENIZER_FOR_DOC,
999-
checkpoint=_CHECKPOINT_FOR_DOC,
1024+
checkpoint="textattack/albert-base-v2-imdb",
10001025
output_type=SequenceClassifierOutput,
10011026
config_class=_CONFIG_FOR_DOC,
1027+
expected_output="'LABEL_1'",
1028+
expected_loss=0.12,
10021029
)
10031030
def forward(
10041031
self,
@@ -1103,9 +1130,12 @@ def __init__(self, config):
11031130
@add_start_docstrings_to_model_forward(ALBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
11041131
@add_code_sample_docstrings(
11051132
processor_class=_TOKENIZER_FOR_DOC,
1106-
checkpoint=_CHECKPOINT_FOR_DOC,
1133+
checkpoint="vumichien/tiny-albert",
11071134
output_type=TokenClassifierOutput,
11081135
config_class=_CONFIG_FOR_DOC,
1136+
expected_output="['LABEL_1', 'LABEL_1', 'LABEL_1', 'LABEL_0', 'LABEL_1', 'LABEL_0', 'LABEL_1', 'LABEL_1', "
1137+
"'LABEL_0', 'LABEL_1', 'LABEL_0', 'LABEL_0', 'LABEL_1', 'LABEL_1']",
1138+
expected_loss=0.66,
11091139
)
11101140
def forward(
11111141
self,
@@ -1184,9 +1214,13 @@ def __init__(self, config):
11841214
@add_start_docstrings_to_model_forward(ALBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
11851215
@add_code_sample_docstrings(
11861216
processor_class=_TOKENIZER_FOR_DOC,
1187-
checkpoint=_CHECKPOINT_FOR_DOC,
1217+
checkpoint="twmkn9/albert-base-v2-squad2",
11881218
output_type=QuestionAnsweringModelOutput,
11891219
config_class=_CONFIG_FOR_DOC,
1220+
qa_target_start_index=12,
1221+
qa_target_end_index=13,
1222+
expected_output="'a nice puppet'",
1223+
expected_loss=7.36,
11901224
)
11911225
def forward(
11921226
self,

src/transformers/models/albert/modeling_tf_albert.py

Lines changed: 45 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -865,9 +865,8 @@ def call(
865865
>>> tokenizer = AlbertTokenizer.from_pretrained("albert-base-v2")
866866
>>> model = TFAlbertForPreTraining.from_pretrained("albert-base-v2")
867867
868-
>>> input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[
869-
... None, :
870-
>>> ] # Batch size 1
868+
>>> input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]
869+
>>> # Batch size 1
871870
>>> outputs = model(input_ids)
872871
873872
>>> prediction_logits = outputs.prediction_logits
@@ -954,12 +953,7 @@ def get_lm_head(self) -> tf.keras.layers.Layer:
954953

955954
@unpack_inputs
956955
@add_start_docstrings_to_model_forward(ALBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
957-
@add_code_sample_docstrings(
958-
processor_class=_TOKENIZER_FOR_DOC,
959-
checkpoint=_CHECKPOINT_FOR_DOC,
960-
output_type=TFMaskedLMOutput,
961-
config_class=_CONFIG_FOR_DOC,
962-
)
956+
@replace_return_docstrings(output_type=TFMaskedLMOutput, config_class=_CONFIG_FOR_DOC)
963957
def call(
964958
self,
965959
input_ids: Optional[TFModelInputType] = None,
@@ -979,6 +973,36 @@ def call(
979973
Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
980974
config.vocab_size]` (see `input_ids` docstring). Tokens with indices set to `-100` are ignored (masked), the
981975
loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
976+
977+
Returns:
978+
979+
Example:
980+
981+
```python
982+
>>> import tensorflow as tf
983+
>>> from transformers import AlbertTokenizer, TFAlbertForMaskedLM
984+
985+
>>> tokenizer = AlbertTokenizer.from_pretrained("albert-base-v2")
986+
>>> model = TFAlbertForMaskedLM.from_pretrained("albert-base-v2")
987+
988+
>>> # add mask_token
989+
>>> inputs = tokenizer(f"The capital of [MASK] is Paris.", return_tensors="tf")
990+
>>> logits = model(**inputs).logits
991+
992+
>>> # retrieve index of [MASK]
993+
>>> mask_token_index = tf.where(inputs.input_ids == tokenizer.mask_token_id)[0][1]
994+
>>> predicted_token_id = tf.math.argmax(logits[0, mask_token_index], axis=-1)
995+
>>> tokenizer.decode(predicted_token_id)
996+
'france'
997+
```
998+
999+
```python
1000+
>>> labels = tokenizer("The capital of France is Paris.", return_tensors="tf")["input_ids"]
1001+
>>> labels = tf.where(inputs.input_ids == tokenizer.mask_token_id, labels, -100)
1002+
>>> outputs = model(**inputs, labels=labels)
1003+
>>> round(float(outputs.loss), 2)
1004+
0.81
1005+
```
9821006
"""
9831007
outputs = self.albert(
9841008
input_ids=input_ids,
@@ -1043,9 +1067,11 @@ def __init__(self, config: AlbertConfig, *inputs, **kwargs):
10431067
@add_start_docstrings_to_model_forward(ALBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
10441068
@add_code_sample_docstrings(
10451069
processor_class=_TOKENIZER_FOR_DOC,
1046-
checkpoint=_CHECKPOINT_FOR_DOC,
1070+
checkpoint="vumichien/albert-base-v2-imdb",
10471071
output_type=TFSequenceClassifierOutput,
10481072
config_class=_CONFIG_FOR_DOC,
1073+
expected_output="'LABEL_1'",
1074+
expected_loss=0.12,
10491075
)
10501076
def call(
10511077
self,
@@ -1136,9 +1162,12 @@ def __init__(self, config: AlbertConfig, *inputs, **kwargs):
11361162
@add_start_docstrings_to_model_forward(ALBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
11371163
@add_code_sample_docstrings(
11381164
processor_class=_TOKENIZER_FOR_DOC,
1139-
checkpoint=_CHECKPOINT_FOR_DOC,
1165+
checkpoint="vumichien/tiny-albert",
11401166
output_type=TFTokenClassifierOutput,
11411167
config_class=_CONFIG_FOR_DOC,
1168+
expected_output="['LABEL_1', 'LABEL_1', 'LABEL_1', 'LABEL_0', 'LABEL_1', 'LABEL_0', 'LABEL_1', 'LABEL_1', "
1169+
"'LABEL_0', 'LABEL_1', 'LABEL_0', 'LABEL_0', 'LABEL_1', 'LABEL_1']",
1170+
expected_loss=0.66,
11421171
)
11431172
def call(
11441173
self,
@@ -1220,9 +1249,13 @@ def __init__(self, config: AlbertConfig, *inputs, **kwargs):
12201249
@add_start_docstrings_to_model_forward(ALBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
12211250
@add_code_sample_docstrings(
12221251
processor_class=_TOKENIZER_FOR_DOC,
1223-
checkpoint=_CHECKPOINT_FOR_DOC,
1252+
checkpoint="vumichien/albert-base-v2-squad2",
12241253
output_type=TFQuestionAnsweringModelOutput,
12251254
config_class=_CONFIG_FOR_DOC,
1255+
qa_target_start_index=12,
1256+
qa_target_end_index=13,
1257+
expected_output="'a nice puppet'",
1258+
expected_loss=7.36,
12261259
)
12271260
def call(
12281261
self,

0 commit comments

Comments
 (0)