47 changes: 5 additions & 42 deletions paddlenlp/transformers/albert/modeling.py
@@ -15,18 +15,20 @@
"""Modeling classes for ALBERT model."""

import math
from typing import Optional, Tuple, List
from dataclasses import dataclass
from typing import Optional, Tuple

import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from paddle.nn import Layer

from .. import PretrainedModel, register_base_model
from ..activations import ACT2FN
from ..model_outputs import (
BaseModelOutput,
ModelOutput,
BaseModelOutputWithPooling,
MaskedLMOutput,
ModelOutput,
MultipleChoiceModelOutput,
QuestionAnsweringModelOutput,
SequenceClassifierOutput,
@@ -48,45 +50,6 @@
dtype_float = paddle.get_default_dtype()


def get_activation(activation_string):
if activation_string in ACT2FN:
return ACT2FN[activation_string]
else:
raise KeyError("function {} not found in ACT2FN mapping {}".format(activation_string, list(ACT2FN.keys())))


def mish(x):
return x * F.tanh(F.softplus(x))


def linear_act(x):
return x


def swish(x):
return x * F.sigmoid(x)


def gelu_new(x):
"""
Implementation of the GELU activation function currently in Google BERT repo (identical to OpenAI GPT). Also see
the Gaussian Error Linear Units paper: https://arxiv.org/abs/1606.08415
"""
return 0.5 * x * (1.0 + paddle.tanh(math.sqrt(2.0 / math.pi) * (x + 0.044715 * paddle.pow(x, 3.0))))


ACT2FN = {
"relu": F.relu,
"gelu": F.gelu,
"gelu_new": gelu_new,
"tanh": F.tanh,
"sigmoid": F.sigmoid,
"mish": mish,
"linear": linear_act,
"swish": swish,
}


class AlbertForPreTrainingOutput(ModelOutput):
"""
Output type of [`AlbertForPreTraining`].
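Note: the duplicated helpers deleted above (`get_activation`, `mish`, `swish`, `linear_act`, `gelu_new`, `ACT2FN`) are replaced throughout this PR by imports from a shared `..activations` module. A minimal sketch of what that module presumably exposes, reconstructed from the definitions removed in this diff (the actual `paddlenlp/transformers/activations.py` may differ in detail):

```python
# Sketch of the consolidated activations module (assumed), reconstructed from
# the per-model definitions removed in this diff.
import math

import paddle
import paddle.nn.functional as F


def mish(x):
    return x * F.tanh(F.softplus(x))


def linear_act(x):
    return x


def swish(x):
    return x * F.sigmoid(x)


def gelu_new(x):
    """GELU approximation used in the original Google BERT repo (identical to OpenAI GPT).

    See "Gaussian Error Linear Units": https://arxiv.org/abs/1606.08415
    """
    return 0.5 * x * (1.0 + paddle.tanh(math.sqrt(2.0 / math.pi) * (x + 0.044715 * paddle.pow(x, 3.0))))


ACT2FN = {
    "relu": F.relu,
    "gelu": F.gelu,
    "gelu_new": gelu_new,
    "tanh": F.tanh,
    "sigmoid": F.sigmoid,
    "mish": mish,
    "linear": linear_act,
    "swish": swish,
}


def get_activation(activation_string):
    # Look up an activation callable by its config string; unknown names raise KeyError.
    if activation_string in ACT2FN:
        return ACT2FN[activation_string]
    raise KeyError("function {} not found in ACT2FN mapping {}".format(activation_string, list(ACT2FN.keys())))
```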
42 changes: 4 additions & 38 deletions paddlenlp/transformers/bigbird/modeling.py
@@ -12,16 +12,14 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math

import numpy as np
import paddle
from paddle.nn import Linear, Dropout, LayerNorm, LayerList, Layer
import paddle.nn.functional as F
import paddle.nn as nn
import paddle.nn.functional as F
from paddle.nn import Dropout, Layer, LayerList, LayerNorm, Linear

from ..attention_utils import _convert_param_attr_to_list, MultiHeadAttention, AttentionRegistry
from .. import PretrainedModel, register_base_model
from ..activations import ACT2FN
from ..attention_utils import MultiHeadAttention, _convert_param_attr_to_list

__all__ = [
"BigBirdModel",
@@ -38,38 +36,6 @@
]


def mish(x):
return x * F.tanh(F.softplus(x))


def linear_act(x):
return x


def swish(x):
return x * F.sigmoid(x)


def gelu_new(x):
"""
Implementation of the GELU activation function currently in Google BERT repo (identical to OpenAI GPT). Also see
the Gaussian Error Linear Units paper: https://arxiv.org/abs/1606.08415
"""
return 0.5 * x * (1.0 + paddle.tanh(math.sqrt(2.0 / math.pi) * (x + 0.044715 * paddle.pow(x, 3.0))))


ACT2FN = {
"relu": F.relu,
"gelu": F.gelu,
"gelu_new": gelu_new,
"tanh": F.tanh,
"sigmoid": F.sigmoid,
"mish": mish,
"linear": linear_act,
"swish": swish,
}


class TransformerEncoderLayer(Layer):
def __init__(
self,
2 changes: 1 addition & 1 deletion paddlenlp/transformers/codegen/modeling.py
@@ -24,11 +24,11 @@
from ...utils.env import CONFIG_NAME
from ...utils.log import logger
from .. import PretrainedModel, register_base_model
from ..activations import ACT2FN
from ..model_outputs import (
BaseModelOutputWithPastAndCrossAttentions,
CausalLMOutputWithCrossAttentions,
)
from ..nezha.modeling import ACT2FN
from .configuration import (
CODEGEN_PRETRAINED_INIT_CONFIGURATION,
CODEGEN_PRETRAINED_RESOURCE_FILES_MAP,
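Note: CodeGen previously reached into `..nezha.modeling` for `ACT2FN`; it now imports the mapping from the shared module. A small usage sketch of resolving an activation by its config string (the absolute import path is inferred from the relative import above; the input tensor is illustrative, not taken from this diff):

```python
import paddle

# Assumed absolute form of the `from ..activations import ACT2FN` import above.
from paddlenlp.transformers.activations import ACT2FN

x = paddle.randn([2, 8])       # illustrative input tensor
act_fn = ACT2FN["gelu_new"]    # resolve the activation by its string name
y = act_fn(x)                  # apply the tanh-approximated GELU
print(y.shape)                 # [2, 8]
```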
3 changes: 2 additions & 1 deletion paddlenlp/transformers/convbert/modeling.py
@@ -17,8 +17,9 @@
import paddle.nn.functional as F
from paddle import tensor
from paddle.nn import Layer
from ..electra.modeling import get_activation

from .. import PretrainedModel, register_base_model
from ..activations import get_activation

__all__ = [
"ConvBertModel",
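Note: ConvBERT previously imported `get_activation` from `..electra.modeling`; it now uses the shared module directly. A brief usage sketch, assuming the behaviour of the per-file implementations removed elsewhere in this diff (lookup in `ACT2FN`, `KeyError` on unknown names):

```python
import paddle

# Assumed absolute form of the `from ..activations import get_activation` import above.
from paddlenlp.transformers.activations import get_activation

gelu = get_activation("gelu")
out = gelu(paddle.to_tensor([0.5, -1.0, 2.0]))

try:
    get_activation("not_an_activation")
except KeyError as err:
    # The removed implementations raised KeyError listing the supported names.
    print(err)
```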
47 changes: 8 additions & 39 deletions paddlenlp/transformers/electra/modeling.py
@@ -13,23 +13,22 @@
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Optional, Tuple
from dataclasses import dataclass
from typing import Optional

import paddle
from paddle import Tensor
import paddle.nn as nn
import paddle.nn.functional as F
from paddle.nn import TransformerEncoderLayer, TransformerEncoder
from paddle.nn.layer.transformer import _convert_attention_mask
from paddle import Tensor
from paddle.nn import TransformerEncoder, TransformerEncoderLayer

from .. import PretrainedModel, register_base_model
from ..activations import get_activation
from ..model_outputs import (
BaseModelOutputWithPastAndCrossAttentions,
MaskedLMOutput,
MultipleChoiceModelOutput,
QuestionAnsweringModelOutput,
SequenceClassifierOutput,
TokenClassifierOutput,
QuestionAnsweringModelOutput,
MultipleChoiceModelOutput,
MaskedLMOutput,
tuple_output,
)

@@ -53,36 +52,6 @@
]


def get_activation(activation_string):
if activation_string in ACT2FN:
return ACT2FN[activation_string]
else:
raise KeyError("function {} not found in ACT2FN mapping {}".format(activation_string, list(ACT2FN.keys())))


def mish(x):
return x * F.tanh(F.softplus(x))


def linear_act(x):
return x


def swish(x):
return x * F.sigmoid(x)


ACT2FN = {
"relu": F.relu,
"gelu": F.gelu,
"tanh": F.tanh,
"sigmoid": F.sigmoid,
"mish": mish,
"linear": linear_act,
"swish": swish,
}


class ElectraEmbeddings(nn.Layer):
"""Construct the embeddings from word, position and token_type embeddings."""

44 changes: 2 additions & 42 deletions paddlenlp/transformers/fnet/modeling.py
@@ -13,13 +13,12 @@
# limitations under the License.
"""Modeling classes for FNet model."""

import math
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from functools import partial
from paddle.nn import Layer

from .. import PretrainedModel, register_base_model
from ..activations import ACT2FN

__all__ = [
"FNetPretrainedModel",
@@ -34,45 +33,6 @@
]


def get_activation(activation_string):
if activation_string in ACT2FN:
return ACT2FN[activation_string]
else:
raise KeyError("function {} not found in ACT2FN mapping {}".format(activation_string, list(ACT2FN.keys())))


def mish(x):
return x * F.tanh(F.softplus(x))


def linear_act(x):
return x


def swish(x):
return x * F.sigmoid(x)


def gelu_new(x):
"""
Implementation of the GELU activation function currently in Google BERT repo (identical to OpenAI GPT). Also see
the Gaussian Error Linear Units paper: https://arxiv.org/abs/1606.08415
"""
return 0.5 * x * (1.0 + paddle.tanh(math.sqrt(2.0 / math.pi) * (x + 0.044715 * paddle.pow(x, 3.0))))


ACT2FN = {
"relu": F.relu,
"gelu": F.gelu,
"gelu_new": gelu_new,
"tanh": F.tanh,
"sigmoid": F.sigmoid,
"mish": mish,
"linear": linear_act,
"swish": swish,
}


class FNetBasicOutput(Layer):
def __init__(self, hidden_size, layer_norm_eps):
super().__init__()