
Commit e03966e

gante and sgugger authored
TF: XLA stable softmax (#16892)
Co-authored-by: Sylvain Gugger <[email protected]>
1 parent 8246caf commit e03966e

File tree

49 files changed: +210 -142 lines changed


src/transformers/generation_tf_logits_process.py

Lines changed: 2 additions & 1 deletion
@@ -19,6 +19,7 @@
 import numpy as np
 import tensorflow as tf

+from .tf_utils import stable_softmax
 from .utils import add_start_docstrings
 from .utils.logging import get_logger

@@ -166,7 +167,7 @@ def __call__(self, input_ids: tf.Tensor, scores: tf.Tensor) -> tf.Tensor:
 topk_scores, topk_indices = tf.math.top_k(scores, scores.shape[-1])

 mask_scores = tf.fill(scores.shape, self.filter_value)
-cumulative_probs = tf.math.cumsum(tf.nn.softmax(topk_scores, axis=-1), axis=-1)
+cumulative_probs = tf.math.cumsum(stable_softmax(topk_scores, axis=-1), axis=-1)
 score_mask = cumulative_probs < self.top_p

 # Also include the token that is higher than top_p (the first false = shift and insert a True on the left)
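Every file touched by this commit adds the same import, stable_softmax from tf_utils. The helper itself is not part of this diff excerpt; as a rough orientation only, a drop-in replacement of this kind could look like the sketch below (the exact signature and the workaround constant are assumptions, not code from this commit).

import tensorflow as tf


def stable_softmax(logits, axis=None, name=None):
    # Sketch of an assumed implementation: softmax is shift-invariant,
    # softmax(x) == softmax(x + c), so adding a tiny constant leaves the result
    # unchanged while sidestepping the numerical issue that plain tf.nn.softmax
    # can hit when the graph is compiled with XLA.
    return tf.nn.softmax(logits=logits + 1e-9, axis=axis, name=name)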

src/transformers/generation_tf_utils.py

Lines changed: 2 additions & 2 deletions
@@ -34,7 +34,7 @@
 TFTopKLogitsWarper,
 TFTopPLogitsWarper,
 )
-from .tf_utils import shape_list
+from .tf_utils import shape_list, stable_softmax
 from .utils import ModelOutput, logging


@@ -3060,7 +3060,7 @@ def tf_top_k_top_p_filtering(logits, top_k=0, top_p=1.0, filter_value=-float("In
 logits, sorted_indices, axis=-1, batch_dims=1
 ) # expects logits to be of dim (batch_size, vocab_size)

-cumulative_probs = tf.math.cumsum(tf.nn.softmax(sorted_logits, axis=-1), axis=-1)
+cumulative_probs = tf.math.cumsum(stable_softmax(sorted_logits, axis=-1), axis=-1)

 # Remove tokens with cumulative probability above the threshold (token with 0 are kept)
 sorted_indices_to_remove = cumulative_probs > top_p
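Both generation hunks change the same step of top-p (nucleus) filtering: sort the logits, turn them into probabilities, accumulate them, and mark everything past the threshold for removal. A standalone toy version of that step with the new helper (the logits and the 0.9 threshold are illustrative, not values from the commit):

import tensorflow as tf

from transformers.tf_utils import stable_softmax  # import added by this commit

# Toy logits over a 4-token vocabulary, already sorted in descending order.
sorted_logits = tf.constant([[2.0, 1.0, 0.0, -1.0]])
top_p = 0.9  # illustrative nucleus threshold

# Previously tf.nn.softmax; stable_softmax is intended as a drop-in replacement.
cumulative_probs = tf.math.cumsum(stable_softmax(sorted_logits, axis=-1), axis=-1)

# Remove tokens with cumulative probability above the threshold.
sorted_indices_to_remove = cumulative_probs > top_p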

src/transformers/models/albert/modeling_tf_albert.py

Lines changed: 2 additions & 2 deletions
@@ -44,7 +44,7 @@
 keras_serializable,
 unpack_inputs,
 )
-from ...tf_utils import shape_list
+from ...tf_utils import shape_list, stable_softmax
 from ...utils import (
 MULTIPLE_CHOICE_DUMMY_INPUTS,
 ModelOutput,
@@ -259,7 +259,7 @@ def call(
 attention_scores = tf.add(attention_scores, attention_mask)

 # Normalize the attention scores to probabilities.
-attention_probs = tf.nn.softmax(logits=attention_scores, axis=-1)
+attention_probs = stable_softmax(logits=attention_scores, axis=-1)

 # This is actually dropping out entire tokens to attend to, which might
 # seem a bit unusual, but is taken from the original Transformer paper.
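The remaining model files repeat the same one-line swap inside self-attention: the additive mask is applied to the raw scores, and the softmax that normalizes them into probabilities is replaced. In isolation the pattern looks roughly like this (shapes and mask values are illustrative, not from any of the files below):

import tensorflow as tf

from transformers.tf_utils import stable_softmax  # import added by this commit

# One batch, 4 query positions, 4 key positions; the additive mask blocks the
# last two keys with a large negative value, as the models above do.
attention_scores = tf.random.normal((1, 4, 4))
attention_mask = tf.constant([[[0.0, 0.0, -1e9, -1e9]] * 4])

attention_scores = tf.add(attention_scores, attention_mask)
# Before this commit: tf.nn.softmax(logits=attention_scores, axis=-1)
attention_probs = stable_softmax(logits=attention_scores, axis=-1)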

src/transformers/models/bart/modeling_tf_bart.py

Lines changed: 2 additions & 2 deletions
@@ -40,7 +40,7 @@
 keras_serializable,
 unpack_inputs,
 )
-from ...tf_utils import shape_list
+from ...tf_utils import shape_list, stable_softmax
 from ...utils import (
 add_code_sample_docstrings,
 add_end_docstrings,
@@ -244,7 +244,7 @@ def call(
 attn_weights = tf.reshape(attn_weights, (bsz, self.num_heads, tgt_len, src_len)) + attention_mask
 attn_weights = tf.reshape(attn_weights, (bsz * self.num_heads, tgt_len, src_len))

-attn_weights = tf.nn.softmax(attn_weights, axis=-1)
+attn_weights = stable_softmax(attn_weights, axis=-1)

 if layer_head_mask is not None:
 # The tf.debugging asserts are not compliant with XLA then they

src/transformers/models/bert/modeling_tf_bert.py

Lines changed: 2 additions & 2 deletions
@@ -49,7 +49,7 @@
 keras_serializable,
 unpack_inputs,
 )
-from ...tf_utils import shape_list
+from ...tf_utils import shape_list, stable_softmax
 from ...utils import (
 DUMMY_INPUTS,
 MULTIPLE_CHOICE_DUMMY_INPUTS,
@@ -322,7 +322,7 @@ def call(
 attention_scores = tf.add(attention_scores, attention_mask)

 # Normalize the attention scores to probabilities.
-attention_probs = tf.nn.softmax(logits=attention_scores, axis=-1)
+attention_probs = stable_softmax(logits=attention_scores, axis=-1)

 # This is actually dropping out entire tokens to attend to, which might
 # seem a bit unusual, but is taken from the original Transformer paper.

src/transformers/models/blenderbot/modeling_tf_blenderbot.py

Lines changed: 2 additions & 2 deletions
@@ -40,7 +40,7 @@
 keras_serializable,
 unpack_inputs,
 )
-from ...tf_utils import shape_list
+from ...tf_utils import shape_list, stable_softmax
 from ...utils import (
 add_code_sample_docstrings,
 add_end_docstrings,
@@ -245,7 +245,7 @@ def call(
 attn_weights = tf.reshape(attn_weights, (bsz, self.num_heads, tgt_len, src_len)) + attention_mask
 attn_weights = tf.reshape(attn_weights, (bsz * self.num_heads, tgt_len, src_len))

-attn_weights = tf.nn.softmax(attn_weights, axis=-1)
+attn_weights = stable_softmax(attn_weights, axis=-1)

 if layer_head_mask is not None:
 # The tf.debugging asserts are not compliant with XLA then they

src/transformers/models/blenderbot_small/modeling_tf_blenderbot_small.py

Lines changed: 2 additions & 2 deletions
@@ -39,7 +39,7 @@
 keras_serializable,
 unpack_inputs,
 )
-from ...tf_utils import shape_list
+from ...tf_utils import shape_list, stable_softmax
 from ...utils import (
 add_code_sample_docstrings,
 add_end_docstrings,
@@ -245,7 +245,7 @@ def call(
 attn_weights = tf.reshape(attn_weights, (bsz, self.num_heads, tgt_len, src_len)) + attention_mask
 attn_weights = tf.reshape(attn_weights, (bsz * self.num_heads, tgt_len, src_len))

-attn_weights = tf.nn.softmax(attn_weights, axis=-1)
+attn_weights = stable_softmax(attn_weights, axis=-1)

 if layer_head_mask is not None:
 # The tf.debugging asserts are not compliant with XLA then they

src/transformers/models/clip/modeling_tf_clip.py

Lines changed: 2 additions & 2 deletions
@@ -34,7 +34,7 @@
 keras_serializable,
 unpack_inputs,
 )
-from ...tf_utils import shape_list
+from ...tf_utils import shape_list, stable_softmax
 from ...utils import (
 ModelOutput,
 add_start_docstrings,
@@ -333,7 +333,7 @@ def call(
 attention_scores = tf.add(attention_scores, attention_mask)

 # Normalize the attention scores to probabilities.
-_attention_probs = tf.nn.softmax(logits=attention_scores, axis=-1)
+_attention_probs = stable_softmax(logits=attention_scores, axis=-1)

 # This is actually dropping out entire tokens to attend to, which might
 # seem a bit unusual, but is taken from the original Transformer paper.

src/transformers/models/convbert/modeling_tf_convbert.py

Lines changed: 3 additions & 3 deletions
@@ -42,7 +42,7 @@
 keras_serializable,
 unpack_inputs,
 )
-from ...tf_utils import shape_list
+from ...tf_utils import shape_list, stable_softmax
 from ...utils import (
 MULTIPLE_CHOICE_DUMMY_INPUTS,
 add_code_sample_docstrings,
@@ -228,7 +228,7 @@ def call(self, hidden_states, attention_mask, head_mask, output_attentions, trai

 conv_kernel_layer = self.conv_kernel_layer(conv_attn_layer)
 conv_kernel_layer = tf.reshape(conv_kernel_layer, [-1, self.conv_kernel_size, 1])
-conv_kernel_layer = tf.nn.softmax(conv_kernel_layer, axis=1)
+conv_kernel_layer = stable_softmax(conv_kernel_layer, axis=1)

 paddings = tf.constant(
 [
@@ -270,7 +270,7 @@ def call(self, hidden_states, attention_mask, head_mask, output_attentions, trai
 attention_scores = attention_scores + attention_mask

 # Normalize the attention scores to probabilities.
-attention_probs = tf.nn.softmax(attention_scores, axis=-1)
+attention_probs = stable_softmax(attention_scores, axis=-1)

 # This is actually dropping out entire tokens to attend to, which might
 # seem a bit unusual, but is taken from the original Transformer paper.

src/transformers/models/ctrl/modeling_tf_ctrl.py

Lines changed: 2 additions & 2 deletions
@@ -31,7 +31,7 @@
 keras_serializable,
 unpack_inputs,
 )
-from ...tf_utils import shape_list
+from ...tf_utils import shape_list, stable_softmax
 from ...utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, logging
 from .configuration_ctrl import CTRLConfig

@@ -79,7 +79,7 @@ def scaled_dot_product_attention(q, k, v, mask, attention_mask=None, head_mask=N
 attention_mask = tf.cast(attention_mask, dtype=scaled_attention_logits.dtype)
 scaled_attention_logits = scaled_attention_logits + attention_mask

-attention_weights = tf.nn.softmax(scaled_attention_logits, axis=-1)
+attention_weights = stable_softmax(scaled_attention_logits, axis=-1)

 # Mask heads if we want to
 if head_mask is not None:
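As a closing sanity check for a swap of this kind, one can confirm in eager mode that the helper and tf.nn.softmax agree to within numerical tolerance (an illustrative check, not part of the commit):

import tensorflow as tf

from transformers.tf_utils import stable_softmax  # import added by this commit

logits = tf.random.normal((2, 8))
tf.debugging.assert_near(tf.nn.softmax(logits, axis=-1), stable_softmax(logits, axis=-1))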
