@@ -35,6 +35,8 @@ def __init__(
         lora_alpha: int = 1,
         lora_dropout: float = 0.0,
         merge_weights: bool = True,
+        rslora: bool = False,
+        lora_plus_scale: float = 1.0,
         **kwargs
     ):
         nn.Linear.__init__(self, in_features, out_features, **kwargs)
@@ -62,9 +64,16 @@ def __init__(
             shape=[r, out_features],
             dtype=self._dtype,
             is_bias=False,
-            default_initializer=nn.initializer.Constant(value=0.0),
+            attr=paddle.ParamAttr(
+                initializer=paddle.nn.initializer.Constant(value=0.0),
+                learning_rate=lora_plus_scale,
+            ),
         )
-        self.scaling = self.lora_alpha / self.r
+
+        if not rslora:
+            self.scaling = self.lora_alpha / self.r
+        else:
+            self.scaling = self.lora_alpha / math.sqrt(self.r)

         # Freezing the pre-trained weight matrix
         self.weight.stop_gradient = True
@@ -104,6 +113,8 @@ def __init__(
         r: int = 0,
         lora_alpha: int = 1,
         lora_dropout: float = 0.0,
+        rslora: bool = False,
+        lora_plus_scale: float = 1.0,
         merge_weights: bool = True,
         **kwargs
     ):
@@ -137,12 +148,19 @@ def __init__(
             shape=[r, self.out_features],
             dtype=self._dtype,
             is_bias=False,
-            default_initializer=nn.initializer.Constant(value=0.0),
+            attr=paddle.ParamAttr(
+                initializer=paddle.nn.initializer.Constant(value=0.0),
+                learning_rate=lora_plus_scale,
+            ),
         )
+
         self.lora_A.is_distributed = True
         self.lora_A.split_axis = 0
         self.lora_B.is_distributed = False
-        self.scaling = self.lora_alpha / self.r
+        if not rslora:
+            self.scaling = self.lora_alpha / self.r
+        else:
+            self.scaling = self.lora_alpha / math.sqrt(self.r)

         # Freezing the pre-trained weight matrix
         self.weight.stop_gradient = True
@@ -208,6 +226,8 @@ def __init__(
         r: int = 0,
         lora_alpha: int = 1,
         lora_dropout: float = 0.0,
+        rslora: bool = False,
+        lora_plus_scale: float = 1.0,
         merge_weights: bool = True,
         lora_A_weight_attr: Optional[paddle.ParamAttr] = None,
         **kwargs
@@ -241,11 +261,18 @@ def __init__(
             shape=[r, self.output_size_per_partition],
             dtype=self._dtype,
             is_bias=False,
-            default_initializer=nn.initializer.Constant(value=0.0),
+            attr=paddle.ParamAttr(
+                initializer=paddle.nn.initializer.Constant(value=0.0),
+                learning_rate=lora_plus_scale,
+            ),
         )
+
         self.lora_B.is_distributed = True
         self.lora_B.split_axis = 1
-        self.scaling = self.lora_alpha / self.r
+        if not rslora:
+            self.scaling = self.lora_alpha / self.r
+        else:
+            self.scaling = self.lora_alpha / math.sqrt(self.r)

         # Freezing the pre-trained weight matrix
         self.weight.stop_gradient = True
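For context (not part of the diff): rslora switches the LoRA scaling factor from lora_alpha / r to lora_alpha / sqrt(r), and lora_plus_scale is applied as a per-parameter learning-rate multiplier on lora_B through paddle.ParamAttr, so lora_B can train faster than lora_A. The sketch below illustrates both knobs; it assumes paddle is installed and that the module containing these classes already imports math (the import hunk is not shown in this diff), and the values used are hypothetical.

import math

import paddle

r, lora_alpha = 8, 16

# Scaling applied to the LoRA update: default LoRA vs. rsLoRA (rslora=True).
print(lora_alpha / r)             # 2.0
print(lora_alpha / math.sqrt(r))  # ~5.657

# lora_B is created with a ParamAttr whose learning_rate multiplies the
# optimizer's base learning rate for that parameter only; a value > 1.0
# (hypothetical here) makes lora_B learn faster than lora_A.
lora_B_attr = paddle.ParamAttr(
    initializer=paddle.nn.initializer.Constant(value=0.0),
    learning_rate=4.0,
)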