47 changes: 34 additions & 13 deletions python/paddle/optimizer/radam.py
@@ -12,7 +12,10 @@
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import annotations

import warnings
from typing import TYPE_CHECKING, Sequence

from paddle import _C_ops
from paddle.base.libpaddle import DataType
@@ -24,6 +27,22 @@
)
from .optimizer import Optimizer

if TYPE_CHECKING:
from typing_extensions import NotRequired

from paddle import Tensor
from paddle.nn.clip import GradientClipBase
from paddle.optimizer.lr import LRScheduler
from paddle.regularizer import WeightDecayRegularizer

from .optimizer import _ParameterConfig

class _RAdamParameterConfig(_ParameterConfig):
beta1: NotRequired[float | Tensor]
beta2: NotRequired[float | Tensor]
epsilon: NotRequired[float]


__all__ = []


@@ -56,7 +75,7 @@ class RAdam(Optimizer):
Args:
learning_rate (float|LRScheduler, optional): The learning rate used to update ``Parameter``.
It can be a float value or a LRScheduler. The default value is 0.001.
parameters (list|tuple, optional): List/Tuple of ``Tensor`` names to update to minimize ``loss``.
parameters (list|tuple|None, optional): List/Tuple of ``Tensor`` names to update to minimize ``loss``.
This parameter is required in dygraph mode. And you can specify different options for
different parameter groups such as the learning rate, weight decay, etc,
then the parameters are list of dict. Note that the learning_rate in parameter groups
@@ -70,13 +89,13 @@
The default value is 0.999.
epsilon (float, optional): A small float value for numerical stability.
The default value is 1e-08.
weight_decay (float|Tensor, optional): The weight decay coefficient, it can be float or Tensor.
weight_decay (float|Tensor|WeightDecayRegularizer|None, optional): The weight decay coefficient, it can be float or Tensor.
Default None, meaning there is no regularization.
grad_clip (GradientClipBase, optional): Gradient clipping strategy, it's an instance of
grad_clip (GradientClipBase|None, optional): Gradient clipping strategy, it's an instance of
some derived class of ``GradientClipBase`` . There are three clipping strategies
( :ref:`api_paddle_nn_ClipGradByGlobalNorm` , :ref:`api_paddle_nn_ClipGradByNorm` ,
:ref:`api_paddle_nn_ClipGradByValue` ). Default None, meaning there is no gradient clipping.
name (str, optional): Normally there is no need for user to set this property.
name (str|None, optional): Normally there is no need for user to set this property.
For more information, please refer to :ref:`api_guide_Name`.
The default value is None.

@@ -133,15 +152,17 @@ class RAdam(Optimizer):

def __init__(
self,
learning_rate=0.001,
beta1=0.9,
beta2=0.999,
epsilon=1.0e-8,
parameters=None,
weight_decay=None,
grad_clip=None,
name=None,
):
learning_rate: float | LRScheduler = 0.001,
beta1: float | Tensor = 0.9,
beta2: float | Tensor = 0.999,
epsilon: float = 1.0e-8,
parameters: Sequence[Tensor]
| Sequence[_RAdamParameterConfig]
| None = None,
weight_decay: float | Tensor | WeightDecayRegularizer | None = None,
grad_clip: GradientClipBase | None = None,
name: str | None = None,
) -> None:
if isinstance(learning_rate, (float, int)) and not 0.0 <= learning_rate:
raise ValueError(
f"Invalid learning rate: {learning_rate}, expect learning_rate >= 0."
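A minimal usage sketch of the annotated RAdam signature (the layer and hyper-parameter values below are illustrative assumptions, not taken from this PR; the per-group dict keys mirror the NotRequired fields of _RAdamParameterConfig):

import paddle

linear = paddle.nn.Linear(10, 10)
opt = paddle.optimizer.RAdam(
    learning_rate=0.001,
    beta1=0.9,
    beta2=0.999,
    parameters=[
        # Per-group overrides use the keys declared in _RAdamParameterConfig.
        {"params": linear.parameters(), "beta1": 0.8, "epsilon": 1e-7},
    ],
    weight_decay=0.01,
)

out = linear(paddle.rand([4, 10]))
out.mean().backward()
opt.step()
opt.clear_grad()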
52 changes: 38 additions & 14 deletions python/paddle/optimizer/rmsprop.py
@@ -12,14 +12,36 @@
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import annotations

import warnings
from typing import TYPE_CHECKING, Sequence

from typing_extensions import NotRequired

from paddle import _C_ops

from ..base import framework
from ..base.framework import in_dynamic_or_pir_mode
from .optimizer import Optimizer

if TYPE_CHECKING:
from typing_extensions import NotRequired

from paddle import Tensor
from paddle.nn.clip import GradientClipBase
from paddle.optimizer.lr import LRScheduler
from paddle.regularizer import WeightDecayRegularizer

from .optimizer import _ParameterConfig

class _RMSPropParameterConfig(_ParameterConfig):
epsilon: NotRequired[float]
momentum: NotRequired[float]
rho: NotRequired[float]
centered: NotRequired[bool]


__all__ = []


@@ -83,24 +105,24 @@ class RMSProp(Optimizer):
the gradient; if False, by the uncentered second moment. Setting this to
True may help with training, but is slightly more expensive in terms of
computation and memory. Defaults to False.
parameters (list|tuple, optional): List/Tuple of ``Tensor`` to update to minimize ``loss``.
parameters (list|tuple|None, optional): List/Tuple of ``Tensor`` to update to minimize ``loss``.
This parameter is required in dygraph mode. And you can specify different options for
different parameter groups such as the learning rate, weight decay, etc,
then the parameters are list of dict. Note that the learning_rate in parameter groups
represents the scale of base learning_rate.
The default value is None in static graph mode, at this time all parameters will be updated.
weight_decay (float|WeightDecayRegularizer, optional): The strategy of regularization.
weight_decay (float|WeightDecayRegularizer|None, optional): The strategy of regularization.
It can be a float value as coeff of L2 regularization or \
:ref:`api_paddle_regularizer_L1Decay`, :ref:`api_paddle_regularizer_L2Decay`.
If a parameter has set regularizer using :ref:`api_paddle_ParamAttr` already,
the regularization setting here in optimizer will be ignored for this parameter.
Otherwise, the regularization setting here in optimizer will take effect.
Default None, meaning there is no regularization.
grad_clip (GradientClipBase, optional): Gradient clipping strategy, it's an instance of
grad_clip (GradientClipBase|None, optional): Gradient clipping strategy, it's an instance of
some derived class of ``GradientClipBase`` . There are three clipping strategies
( :ref:`api_paddle_nn_ClipGradByGlobalNorm` , :ref:`api_paddle_nn_ClipGradByNorm` ,
:ref:`api_paddle_nn_ClipGradByValue` ). Default None, meaning there is no gradient clipping.
name (str, optional): This parameter is used by developers to print debugging information.
name (str|None, optional): This parameter is used by developers to print debugging information.
For details, please refer to :ref:`api_guide_Name`. Default is None.

Examples:
@@ -149,16 +171,18 @@ class RMSProp(Optimizer):

def __init__(
self,
learning_rate,
rho=0.95,
epsilon=1.0e-6,
momentum=0.0,
centered=False,
parameters=None,
weight_decay=None,
grad_clip=None,
name=None,
):
learning_rate: float | LRScheduler,
rho: float = 0.95,
epsilon: float = 1.0e-6,
momentum: float = 0.0,
centered: bool = False,
parameters: Sequence[Tensor]
| Sequence[_RMSPropParameterConfig]
| None = None,
weight_decay: float | WeightDecayRegularizer | None = None,
grad_clip: GradientClipBase | None = None,
name: str | None = None,
) -> None:
if learning_rate is None:
raise ValueError("learning_rate is not set.")
if rho is None:
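A comparable sketch for the annotated RMSProp signature (again with illustrative values; whether every _RMSPropParameterConfig key is honored per group at runtime is an assumption based on the TypedDict, since the annotation itself only constrains static checking):

import paddle

linear = paddle.nn.Linear(10, 10)
opt = paddle.optimizer.RMSProp(
    learning_rate=0.01,  # no default in either signature, so still required
    rho=0.9,
    momentum=0.9,
    centered=True,
    parameters=[
        # Per-group override of the NotRequired keys from _RMSPropParameterConfig.
        {"params": linear.parameters(), "rho": 0.95, "centered": False},
    ],
)

out = linear(paddle.rand([4, 10]))
out.mean().backward()
opt.step()
opt.clear_grad()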
36 changes: 23 additions & 13 deletions python/paddle/optimizer/rprop.py
@@ -12,7 +12,10 @@
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import annotations

import warnings
from typing import TYPE_CHECKING, Sequence

from paddle import _C_ops
from paddle.tensor.creation import to_tensor
@@ -22,6 +25,13 @@
from ..base.framework import in_dynamic_or_pir_mode
from .optimizer import Optimizer

if TYPE_CHECKING:
from paddle import Tensor
from paddle.nn.clip import GradientClipBase
from paddle.optimizer.lr import LRScheduler

from .optimizer import _ParameterConfig

__all__ = []


@@ -53,20 +63,20 @@ class Rprop(Optimizer):
\end{aligned}

Parameters:
learning_rate (float|Tensor|LearningRateDecay, optional): The initial learning rate used to update ``Parameter``.
It can be a float value, a ``Tensor`` with a float type or a LearningRateDecay. The default value is 0.001.
learning_rate (float|Tensor|LRScheduler, optional): The initial learning rate used to update ``Parameter``.
It can be a float value, a ``Tensor`` with a float type or a LRScheduler. The default value is 0.001.
learning_rate_range (tuple, optional): The range of learning rate.
Learning rate cannot be smaller than the first element of the tuple;
learning rate cannot be larger than the second element of the tuple.
The default value is (1e-5, 50).
parameters (list|tuple, optional): List/Tuple of ``Tensor`` to update to minimize ``loss``.
parameters (list|tuple|None, optional): List/Tuple of ``Tensor`` to update to minimize ``loss``.
This parameter is required in dygraph mode.
The default value is None in static graph mode, at this time all parameters will be updated.
etas (tuple, optional): Tuple used to update learning rate.
The first element of the tuple is the multiplicative decrease factor;
the second element of the tuple is the multiplicative increase factor.
The default value is (0.5, 1.2).
grad_clip (GradientClipBase, optional): Gradient clipping strategy, it's an instance of some derived class of ``GradientClipBase`` .
grad_clip (GradientClipBase|None, optional): Gradient clipping strategy, it's an instance of some derived class of ``GradientClipBase`` .
There are three clipping strategies ( :ref:`api_paddle_nn_ClipGradByGlobalNorm` , :ref:`api_paddle_nn_ClipGradByNorm` , :ref:`api_paddle_nn_ClipGradByValue` ).
Default None, meaning there is no gradient clipping.
multi_precision (bool, optional): In mixed precision training scenarios based on GPU,
@@ -76,7 +86,7 @@ class Rprop(Optimizer):
Finally, the updated FP32 type value will be converted to FP16 type first,
and then assigned to the actual FP16 type parameters participating in the calculation.
The default value is False.
name (str, optional): The default value is None. Normally there is no need for user to set this property.
name (str|None, optional): The default value is None. Normally there is no need for user to set this property.
For more information, please refer to :ref:`api_guide_Name` .

Examples:
@@ -99,14 +109,14 @@

def __init__(
self,
learning_rate=0.001,
learning_rate_range=(1e-5, 50),
parameters=None,
etas=(0.5, 1.2),
grad_clip=None,
multi_precision=False,
name=None,
):
learning_rate: float | Tensor | LRScheduler = 0.001,
learning_rate_range: tuple[float, float] = (1e-5, 50),
parameters: Sequence[Tensor] | Sequence[_ParameterConfig] | None = None,
etas: tuple[float, float] = (0.5, 1.2),
grad_clip: GradientClipBase | None = None,
multi_precision: bool = False,
name: str | None = None,
) -> None:
if learning_rate is None:
raise ValueError("learning_rate is not set")
if (
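And a sketch for the annotated Rprop signature (illustrative values; Rprop reuses the plain _ParameterConfig groups, with the step-size bounds and multiplicative factors passed as float tuples):

import paddle

linear = paddle.nn.Linear(10, 10)
opt = paddle.optimizer.Rprop(
    learning_rate=0.001,
    learning_rate_range=(1e-5, 50.0),  # lower/upper bound on the adapted step size
    etas=(0.5, 1.2),                   # multiplicative decrease/increase factors
    parameters=linear.parameters(),
    multi_precision=False,
)

out = linear(paddle.rand([4, 10]))
out.mean().backward()
opt.step()
opt.clear_grad()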