47 changes: 34 additions & 13 deletions python/paddle/optimizer/radam.py
@@ -12,7 +12,10 @@
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import annotations

import warnings
from typing import TYPE_CHECKING, Sequence

from paddle import _C_ops
from paddle.base.libpaddle import DataType
@@ -24,6 +27,22 @@
)
from .optimizer import Optimizer

if TYPE_CHECKING:
from typing_extensions import NotRequired

from paddle import Tensor
from paddle.nn.clip import GradientClipBase
from paddle.optimizer.lr import LRScheduler
from paddle.regularizer import WeightDecayRegularizer

from .optimizer import _ParameterConfig

class _RAdamParameterConfig(_ParameterConfig):
beta1: NotRequired[float | Tensor]
beta2: NotRequired[float | Tensor]
epsilon: NotRequired[float]


__all__ = []


@@ -56,7 +75,7 @@ class RAdam(Optimizer):
Args:
learning_rate (float|LRScheduler, optional): The learning rate used to update ``Parameter``.
It can be a float value or a LRScheduler. The default value is 0.001.
parameters (list|tuple, optional): List/Tuple of ``Tensor`` names to update to minimize ``loss``.
parameters (list|tuple|None, optional): List/Tuple of ``Tensor`` names to update to minimize ``loss``.
This parameter is required in dygraph mode. And you can specify different options for
different parameter groups such as the learning rate, weight decay, etc,
then the parameters are list of dict. Note that the learning_rate in parameter groups
@@ -70,13 +89,13 @@
The default value is 0.999.
epsilon (float, optional): A small float value for numerical stability.
The default value is 1e-08.
weight_decay (float|Tensor, optional): The weight decay coefficient, it can be float or Tensor.
weight_decay (float|Tensor|WeightDecayRegularizer|None, optional): The weight decay coefficient, it can be float or Tensor.
Default None, meaning there is no regularization.
grad_clip (GradientClipBase, optional): Gradient clipping strategy, it's an instance of
grad_clip (GradientClipBase|None, optional): Gradient clipping strategy, it's an instance of
some derived class of ``GradientClipBase`` . There are three clipping strategies
( :ref:`api_paddle_nn_ClipGradByGlobalNorm` , :ref:`api_paddle_nn_ClipGradByNorm` ,
:ref:`api_paddle_nn_ClipGradByValue` ). Default None, meaning there is no gradient clipping.
name (str, optional): Normally there is no need for user to set this property.
name (str|None, optional): Normally there is no need for user to set this property.
For more information, please refer to :ref:`api_guide_Name`.
The default value is None.

@@ -133,15 +152,17 @@ class RAdam(Optimizer):

def __init__(
self,
learning_rate=0.001,
beta1=0.9,
beta2=0.999,
epsilon=1.0e-8,
parameters=None,
weight_decay=None,
grad_clip=None,
name=None,
):
learning_rate: float | LRScheduler = 0.001,
beta1: float | Tensor = 0.9,
beta2: float | Tensor = 0.999,
epsilon: float = 1.0e-8,
parameters: Sequence[Tensor]
| Sequence[_RAdamParameterConfig]
| None = None,
weight_decay: float | Tensor | WeightDecayRegularizer | None = None,
grad_clip: GradientClipBase | None = None,
name: str | None = None,
) -> None:
if isinstance(learning_rate, (float, int)) and not 0.0 <= learning_rate:
raise ValueError(
f"Invalid learning rate: {learning_rate}, expect learning_rate >= 0."
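A minimal usage sketch of the annotated RAdam signature (the layer and hyper-parameter values below are illustrative assumptions, not taken from this PR; the per-group dict keys mirror the NotRequired fields of _RAdamParameterConfig):

import paddle

linear = paddle.nn.Linear(10, 10)
opt = paddle.optimizer.RAdam(
    learning_rate=0.001,
    beta1=0.9,
    beta2=0.999,
    parameters=[
        # Per-group overrides use the keys declared in _RAdamParameterConfig.
        {"params": linear.parameters(), "beta1": 0.8, "epsilon": 1e-7},
    ],
    weight_decay=0.01,
)

out = linear(paddle.rand([4, 10]))
out.mean().backward()
opt.step()
opt.clear_grad()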
52 changes: 38 additions & 14 deletions python/paddle/optimizer/rmsprop.py
@@ -12,14 +12,36 @@
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import annotations

import warnings
from typing import TYPE_CHECKING, Sequence

from typing_extensions import NotRequired

from paddle import _C_ops

from ..base import framework
from ..base.framework import in_dynamic_or_pir_mode
from .optimizer import Optimizer

if TYPE_CHECKING:
from typing_extensions import NotRequired

from paddle import Tensor
from paddle.nn.clip import GradientClipBase
from paddle.optimizer.lr import LRScheduler
from paddle.regularizer import WeightDecayRegularizer

from .optimizer import _ParameterConfig

class _RMSPropParameterConfig(_ParameterConfig):
epsilon: NotRequired[float]
momentum: NotRequired[float]
rho: NotRequired[float]
centered: NotRequired[bool]


__all__ = []


@@ -83,24 +105,24 @@ class RMSProp(Optimizer):
the gradient; if False, by the uncentered second moment. Setting this to
True may help with training, but is slightly more expensive in terms of
computation and memory. Defaults to False.
parameters (list|tuple, optional): List/Tuple of ``Tensor`` to update to minimize ``loss``.
parameters (list|tuple|None, optional): List/Tuple of ``Tensor`` to update to minimize ``loss``.
This parameter is required in dygraph mode. And you can specify different options for
different parameter groups such as the learning rate, weight decay, etc,
then the parameters are list of dict. Note that the learning_rate in parameter groups
represents the scale of base learning_rate.
The default value is None in static graph mode, at this time all parameters will be updated.
weight_decay (float|WeightDecayRegularizer, optional): The strategy of regularization.
weight_decay (float|WeightDecayRegularizer|None, optional): The strategy of regularization.
It can be a float value as coeff of L2 regularization or \
:ref:`api_paddle_regularizer_L1Decay`, :ref:`api_paddle_regularizer_L2Decay`.
If a parameter has set regularizer using :ref:`api_paddle_ParamAttr` already,
the regularization setting here in optimizer will be ignored for this parameter.
Otherwise, the regularization setting here in optimizer will take effect.
Default None, meaning there is no regularization.
grad_clip (GradientClipBase, optional): Gradient clipping strategy, it's an instance of
grad_clip (GradientClipBase|None, optional): Gradient clipping strategy, it's an instance of
some derived class of ``GradientClipBase`` . There are three clipping strategies
( :ref:`api_paddle_nn_ClipGradByGlobalNorm` , :ref:`api_paddle_nn_ClipGradByNorm` ,
:ref:`api_paddle_nn_ClipGradByValue` ). Default None, meaning there is no gradient clipping.
name (str, optional): This parameter is used by developers to print debugging information.
name (str|None, optional): This parameter is used by developers to print debugging information.
For details, please refer to :ref:`api_guide_Name`. Default is None.

Examples:
@@ -149,16 +171,18 @@ class RMSProp(Optimizer):

def __init__(
self,
learning_rate,
rho=0.95,
epsilon=1.0e-6,
momentum=0.0,
centered=False,
parameters=None,
weight_decay=None,
grad_clip=None,
name=None,
):
learning_rate: float | LRScheduler,
rho: float = 0.95,
epsilon: float = 1.0e-6,
momentum: float = 0.0,
centered: bool = False,
parameters: Sequence[Tensor]
| Sequence[_RMSPropParameterConfig]
| None = None,
weight_decay: float | WeightDecayRegularizer | None = None,
grad_clip: GradientClipBase | None = None,
name: str | None = None,
) -> None:
if learning_rate is None:
raise ValueError("learning_rate is not set.")
if rho is None:
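A comparable sketch for the annotated RMSProp signature (again with illustrative values; whether every _RMSPropParameterConfig key is honored per group at runtime is an assumption based on the TypedDict, since the annotation itself only constrains static checking):

import paddle

linear = paddle.nn.Linear(10, 10)
opt = paddle.optimizer.RMSProp(
    learning_rate=0.01,  # no default in either signature, so still required
    rho=0.9,
    momentum=0.9,
    centered=True,
    parameters=[
        # Per-group override of the NotRequired keys from _RMSPropParameterConfig.
        {"params": linear.parameters(), "rho": 0.95, "centered": False},
    ],
)

out = linear(paddle.rand([4, 10]))
out.mean().backward()
opt.step()
opt.clear_grad()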
36 changes: 23 additions & 13 deletions python/paddle/optimizer/rprop.py
@@ -12,7 +12,10 @@
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import annotations

import warnings
from typing import TYPE_CHECKING, Sequence

from paddle import _C_ops
from paddle.tensor.creation import to_tensor
@@ -22,6 +25,13 @@
from ..base.framework import in_dynamic_or_pir_mode
from .optimizer import Optimizer

if TYPE_CHECKING:
from paddle import Tensor
from paddle.nn.clip import GradientClipBase
from paddle.optimizer.lr import LRScheduler

from .optimizer import _ParameterConfig

__all__ = []


@@ -53,20 +63,20 @@ class Rprop(Optimizer):
\end{aligned}

Parameters:
learning_rate (float|Tensor|LearningRateDecay, optional): The initial learning rate used to update ``Parameter``.
It can be a float value, a ``Tensor`` with a float type or a LearningRateDecay. The default value is 0.001.
learning_rate (float|Tensor|LRScheduler, optional): The initial learning rate used to update ``Parameter``.
It can be a float value, a ``Tensor`` with a float type or a LRScheduler. The default value is 0.001.
learning_rate_range (tuple, optional): The range of learning rate.
Learning rate cannot be smaller than the first element of the tuple;
learning rate cannot be larger than the second element of the tuple.
The default value is (1e-5, 50).
parameters (list|tuple, optional): List/Tuple of ``Tensor`` to update to minimize ``loss``.
parameters (list|tuple|None, optional): List/Tuple of ``Tensor`` to update to minimize ``loss``.
This parameter is required in dygraph mode.
The default value is None in static graph mode, at this time all parameters will be updated.
etas (tuple, optional): Tuple used to update learning rate.
The first element of the tuple is the multiplicative decrease factor;
the second element of the tuple is the multiplicative increase factor.
The default value is (0.5, 1.2).
grad_clip (GradientClipBase, optional): Gradient clipping strategy, it's an instance of some derived class of ``GradientClipBase`` .
grad_clip (GradientClipBase|None, optional): Gradient clipping strategy, it's an instance of some derived class of ``GradientClipBase`` .
There are three clipping strategies ( :ref:`api_paddle_nn_ClipGradByGlobalNorm` , :ref:`api_paddle_nn_ClipGradByNorm` , :ref:`api_paddle_nn_ClipGradByValue` ).
Default None, meaning there is no gradient clipping.
multi_precision (bool, optional): In mixed precision training scenarios based on GPU,
@@ -76,7 +86,7 @@ class Rprop(Optimizer):
Finally, the updated FP32 type value will be converted to FP16 type first,
and then assigned to the actual FP16 type parameters participating in the calculation.
The default value is False.
name (str, optional): The default value is None. Normally there is no need for user to set this property.
name (str|None, optional): The default value is None. Normally there is no need for user to set this property.
For more information, please refer to :ref:`api_guide_Name` .

Examples:
@@ -99,14 +109,14 @@

def __init__(
self,
learning_rate=0.001,
learning_rate_range=(1e-5, 50),
parameters=None,
etas=(0.5, 1.2),
grad_clip=None,
multi_precision=False,
name=None,
):
learning_rate: float | Tensor | LRScheduler = 0.001,
learning_rate_range: tuple[float, float] = (1e-5, 50),
parameters: Sequence[Tensor] | Sequence[_ParameterConfig] | None = None,
etas: tuple[float, float] = (0.5, 1.2),
grad_clip: GradientClipBase | None = None,
multi_precision: bool = False,
name: str | None = None,
) -> None:
if learning_rate is None:
raise ValueError("learning_rate is not set")
if (
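And a sketch for the annotated Rprop signature (illustrative values; Rprop reuses the plain _ParameterConfig groups, with the step-size bounds and multiplicative factors passed as float tuples):

import paddle

linear = paddle.nn.Linear(10, 10)
opt = paddle.optimizer.Rprop(
    learning_rate=0.001,
    learning_rate_range=(1e-5, 50.0),  # lower/upper bound on the adapted step size
    etas=(0.5, 1.2),                   # multiplicative decrease/increase factors
    parameters=linear.parameters(),
    multi_precision=False,
)

out = linear(paddle.rand([4, 10]))
out.mean().backward()
opt.step()
opt.clear_grad()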