
Commit d06a456

MikhayEeer and megemini authored and committed
[Typing][C-67,C-69,C-71,C-72] Add type annotations for 4 files in python/paddle/incubate/nn/functional/ (PaddlePaddle#66616)
---------
Co-authored-by: megemini <[email protected]>
1 parent ec2c76a · commit d06a456

File tree: 4 files changed, +67 −36 lines


python/paddle/incubate/nn/functional/blha_get_max_len.py

Lines changed: 10 additions & 1 deletion
@@ -12,11 +12,20 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
 from paddle import _C_ops
 from paddle.framework import LayerHelper, in_dynamic_or_pir_mode
 
+if TYPE_CHECKING:
+    from paddle import Tensor
+
 
-def blha_get_max_len(seq_lens_encoder, seq_lens_decoder, batch_size):
+def blha_get_max_len(
+    seq_lens_encoder: Tensor, seq_lens_decoder: Tensor, batch_size: Tensor
+) -> tuple[Tensor, Tensor]:
     """
     Apply Fused BlhaGetMaxLen kernel. Typically used before the block_multihead_attention operator.
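
The pattern introduced here is repeated in the other three files: from __future__ import annotations turns the annotations into strings that are never evaluated at runtime, and the TYPE_CHECKING guard makes the Tensor import visible to static checkers only. A minimal, self-contained sketch of the same idiom (the scale function is hypothetical and not part of this commit):

from __future__ import annotations

from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # Imported only during static analysis; no runtime cost or import-cycle risk.
    from paddle import Tensor


def scale(x: Tensor, factor: float = 1.0) -> Tensor:
    # Hypothetical example function, not from the commit. At runtime the
    # annotation is just the string "Tensor", so the guard above suffices.
    return x * factor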

python/paddle/incubate/nn/functional/fused_dot_product_attention.py

Lines changed: 19 additions & 17 deletions
@@ -12,24 +12,26 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import annotations
+
 from paddle import Tensor, _C_ops
 from paddle.framework import LayerHelper, in_dynamic_or_pir_mode
 
 
 def cudnn_flash_attention(
-    q,
-    k,
-    v,
-    bias=None,
-    cu_seqlen_q=None,
-    cu_seqlen_k=None,
-    scaling_factor=1.0,
-    dropout_prob=0.0,
-    training=True,
-    mask_type=None,
-    bias_type=None,
-    name=None,
-):
+    q: Tensor,
+    k: Tensor,
+    v: Tensor,
+    bias: Tensor | None = None,
+    cu_seqlen_q: Tensor | None = None,
+    cu_seqlen_k: Tensor | None = None,
+    scaling_factor: float = 1.0,
+    dropout_prob: float = 0.0,
+    training: bool = True,
+    mask_type: str | None = None,
+    bias_type: str | None = None,
+    name: str | None = None,
+) -> Tensor:
     r"""
     Fused Dot Product Attention. This is a fusion operator to compute scaled dot product attention in transformer
     model architecture. This operator only supports running on Ampere and Hopper GPU and need cudnn version >= 8906.

@@ -128,13 +130,13 @@ def fused_dot_product_attention(
     query: Tensor,
     key: Tensor,
     value: Tensor,
-    attn_mask: Tensor = None,
+    attn_mask: Tensor | None = None,
     dropout_p: float = 0.0,
     is_causal: bool = False,
-    scaling_factor: float = None,
+    scaling_factor: float | None = None,
     training: bool = True,
-    name: str = None,
-):
+    name: str | None = None,
+) -> Tensor:
     r"""
     Fused Dot Product Attention. This is a fusion operator to compute scaled dot product attention in transformer
     model architecture. This operator only supports running on Ampere and Hopper GPU and need cudnn version >= 8906.
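
For context, a hedged usage sketch of the annotated fused_dot_product_attention signature. The [batch, seq_len, num_heads, head_dim] layout and the float16 cast are illustrative assumptions, the import assumes the function is re-exported from paddle.incubate.nn.functional as the file location suggests, and running it needs an Ampere/Hopper GPU with cuDNN >= 8906:

import paddle
from paddle.incubate.nn.functional import fused_dot_product_attention

# Assumed layout: [batch, seq_len, num_heads, head_dim], cast to float16 for the fused kernel.
q = paddle.rand([2, 128, 8, 64]).astype("float16")
k = paddle.rand([2, 128, 8, 64]).astype("float16")
v = paddle.rand([2, 128, 8, 64]).astype("float16")

# attn_mask, scaling_factor, and name can stay None, matching the new "| None" defaults.
out = fused_dot_product_attention(q, k, v, dropout_p=0.0, is_causal=True)
print(out.shape)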

python/paddle/incubate/nn/functional/fused_ec_moe.py

Lines changed: 15 additions & 2 deletions
@@ -12,12 +12,25 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
 from paddle.base.layer_helper import LayerHelper
 
+if TYPE_CHECKING:
+    from paddle import Tensor
+
 
 def fused_ec_moe(
-    x, gate, bmm0_weight, bmm0_bias, bmm1_weight, bmm1_bias, act_type
-):
+    x: Tensor,
+    gate: Tensor,
+    bmm0_weight: Tensor,
+    bmm0_bias: Tensor,
+    bmm1_weight: Tensor,
+    bmm1_bias: Tensor,
+    act_type: str,
+) -> Tensor:
     """
     Applies fused ec_moe kernel.
     This method requires SM_ARCH in sm75, sm80, sm86.
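
A hedged call sketch of the annotated fused_ec_moe signature. The shapes below (x as [bsz, seq_len, d_model], gate as [bsz, seq_len, num_experts], expert weights batched over the expert dimension) are illustrative assumptions, the fp16 helper exists only for this sketch, and the kernel itself needs the SM 75/80/86 hardware noted in the docstring:

import paddle
from paddle.incubate.nn.functional import fused_ec_moe


def fp16(shape):
    # Helper for this sketch only: random float16 tensors of the assumed shapes.
    return paddle.randn(shape).astype("float16")


bsz, seq_len, d_model, d_ffn, num_experts = 2, 128, 768, 3072, 8
x = fp16([bsz, seq_len, d_model])
gate = fp16([bsz, seq_len, num_experts])
bmm0_weight = fp16([num_experts, d_model, d_ffn])
bmm0_bias = fp16([num_experts, 1, d_ffn])
bmm1_weight = fp16([num_experts, d_ffn, d_model])
bmm1_bias = fp16([num_experts, 1, d_model])

out = fused_ec_moe(
    x, gate, bmm0_weight, bmm0_bias, bmm1_weight, bmm1_bias, act_type="gelu"
)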

python/paddle/incubate/nn/functional/fused_gate_attention.py

Lines changed: 23 additions & 16 deletions
@@ -12,27 +12,34 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
 from paddle import _legacy_C_ops
 from paddle.framework import in_dynamic_mode
 
+if TYPE_CHECKING:
+    from paddle import Tensor
+
 
 def fused_gate_attention(
-    query,
-    key=None,
-    query_weight=None,
-    key_weight=None,
-    value_weight=None,
-    qkv_weight=None,
-    gate_linear_weight=None,
-    gate_linear_bias=None,
-    out_linear_weight=None,
-    out_linear_bias=None,
-    nonbatched_bias=None,
-    attn_mask=None,
-    has_gating=True,
-    merge_qkv=True,
-    use_flash_attn=False,
-):
+    query: Tensor,
+    key: Tensor | None = None,
+    query_weight: Tensor | None = None,
+    key_weight: Tensor | None = None,
+    value_weight: Tensor | None = None,
+    qkv_weight: Tensor | None = None,
+    gate_linear_weight: Tensor | None = None,
+    gate_linear_bias: Tensor | None = None,
+    out_linear_weight: Tensor | None = None,
+    out_linear_bias: Tensor | None = None,
+    nonbatched_bias: Tensor | None = None,
+    attn_mask: Tensor | None = None,
+    has_gating: bool = True,
+    merge_qkv: bool = True,
+    use_flash_attn: bool = False,
+) -> Tensor:
     r"""
     Attention maps queries and a set of key-value pairs to outputs, and
     Gate Attention performs multiple parallel attention to jointly attending
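
Because deferred annotations are stored as strings, the new hints can be read back without evaluating them, for example via inspect.signature once the module is importable (assumes a Paddle build that ships this file):

import inspect

from paddle.incubate.nn.functional.fused_gate_attention import (
    fused_gate_attention,
)

sig = inspect.signature(fused_gate_attention)
# Each parameter now reports a string annotation such as 'Tensor | None'.
for name, param in sig.parameters.items():
    print(f"{name}: {param.annotation} = {param.default}")
print("return ->", sig.return_annotation)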
