Merged
Changes from 20 commits (44 commits in total):
e38da5b  fix (enkilee, Jan 29, 2024)
36c0063  fix (enkilee, Feb 1, 2024)
79b74c3  fix (enkilee, Feb 1, 2024)
80c2c53  Merge branch 'develop' into PIR-optest-fix-27 (enkilee, Mar 1, 2024)
a585498  Merge branch 'develop' into PIR-optest-fix-27 (enkilee, Mar 6, 2024)
1cbd853  fix (enkilee, Mar 6, 2024)
8ff226d  fix (enkilee, Mar 6, 2024)
584ee74  fix (enkilee, Mar 6, 2024)
b3b2586  fix (enkilee, Mar 7, 2024)
51385d7  fix (enkilee, Mar 8, 2024)
59d0f3d  fix (enkilee, Mar 8, 2024)
d62cd21  fix (enkilee, Mar 11, 2024)
06c367f  fix (enkilee, Mar 11, 2024)
57a84b9  fix (enkilee, Mar 11, 2024)
6c0a155  fix (enkilee, Mar 11, 2024)
ef60899  fix (enkilee, Mar 11, 2024)
2ada845  fix (enkilee, Mar 11, 2024)
6127fe5  fix (enkilee, Mar 11, 2024)
abe0446  fix (enkilee, Mar 11, 2024)
fbc0884  fix (enkilee, Mar 12, 2024)
7601680  fix (enkilee, Mar 12, 2024)
f9c8eb9  fix (enkilee, Mar 12, 2024)
812afa7  fix (enkilee, Mar 13, 2024)
4e0ed3f  Merge branch 'develop' into PIR-optest-fix-27 (enkilee, Mar 13, 2024)
daa5211  fix (enkilee, Mar 13, 2024)
cdc3bf1  Merge branch 'PIR-optest-fix-27' of https://github.com/enkilee/Paddle… (enkilee, Mar 13, 2024)
782fe3f  fix (enkilee, Mar 13, 2024)
0935545  fix (enkilee, Mar 14, 2024)
14d275d  fix (enkilee, Mar 15, 2024)
1bc577f  Merge branch 'develop' into PIR-optest-fix-27 (enkilee, Mar 15, 2024)
af9be1c  fix (enkilee, Mar 18, 2024)
4428c2f  fix (enkilee, Mar 18, 2024)
d66d375  fix (enkilee, Mar 18, 2024)
1180742  fix (enkilee, Mar 20, 2024)
00f941c  fix (enkilee, Mar 22, 2024)
0a167bc  fix (enkilee, Mar 23, 2024)
f082a23  fix (enkilee, Mar 26, 2024)
36dde2a  fix (enkilee, Mar 27, 2024)
33a33c1  fix (enkilee, Mar 27, 2024)
45bebe0  fix (enkilee, Mar 27, 2024)
8afa4f2  fix (enkilee, Apr 2, 2024)
d3c0cae  fix (enkilee, Apr 2, 2024)
30fa5ea  Merge branch 'develop' into PIR-optest-fix-27 (kangguangli, Apr 7, 2024)
d59ebfb  fix (enkilee, Apr 8, 2024)
61 changes: 0 additions & 61 deletions paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cc
@@ -169,64 +169,3 @@ namespace ops = paddle::operators;
REGISTER_OP_WITHOUT_GRADIENT(distributed_fused_lamb,
ops::DistributedFusedLambOp,
ops::DistributedFusedLambOpMaker);

namespace phi {
namespace fusion {

template <typename T, typename Context>
void DistributedFusedLambKernel(const Context &dev_ctx,
const std::vector<const DenseTensor *> &param,
const std::vector<const DenseTensor *> &grad,
const paddle::optional<DenseTensor> &fp32_param,
const paddle::optional<DenseTensor> &fp32_grad,
const paddle::optional<DenseTensor> &fp16_param,
const paddle::optional<DenseTensor> &fp16_grad,
const DenseTensor &moment1,
const DenseTensor &moment2,
const DenseTensor &beta1_pow,
const DenseTensor &beta2_pow,
const DenseTensor &param_offsets,
const DenseTensor &fp32_partial_offsets,
const DenseTensor &fp16_partial_offsets,
const DenseTensor &param_info,
const DenseTensor &param_order,
const DenseTensor &learning_rate,
const DenseTensor &global_scale,
int acc_steps,
float beta1,
float beta2,
float epsilon,
float max_global_grad_norm,
float weight_decay,
bool clip_after_allreduce,
bool use_master_param_norm,
bool use_master_acc_grad,
bool is_grad_scaled_by_nranks,
bool use_hierarchical_allreduce,
int64_t nranks,
const std::vector<int> &ring_ids,
DenseTensor *fp32_param_out,
DenseTensor *fp16_param_out,
DenseTensor *fp32_acc_grad,
DenseTensor *fp16_acc_grad,
DenseTensor *moment1_out,
DenseTensor *moment2_out,
DenseTensor *beta1_pow_out,
DenseTensor *beta2_pow_out,
DenseTensor *param_out,
DenseTensor *found_inf,
DenseTensor *acc_step,
DenseTensor *stop_update,
DenseTensor *step) {
PADDLE_THROW(phi::errors::Unimplemented(
"The distributed_fused_lamb operator does not support CPU yet."));
}

} // namespace fusion
} // namespace phi

PD_REGISTER_KERNEL(distributed_fused_lamb,
CPU,
ALL_LAYOUT,
phi::fusion::DistributedFusedLambKernel,
float) {}
1 change: 1 addition & 0 deletions paddle/fluid/pir/dialect/op_generator/ops_api_gen.py
@@ -69,6 +69,7 @@
{{"{name}", (PyCFunction)(void (*)(void)){name}, METH_VARARGS | METH_KEYWORDS, "C++ interface function for {name}."}},"""

NEED_GEN_STATIC_ONLY_APIS = [
'distributed_fused_lamb',
'distributed_fused_lamb_init',
'distributed_fused_lamb_init_',
'fetch',
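Adding 'distributed_fused_lamb' to NEED_GEN_STATIC_ONLY_APIS asks the ops API generator to emit a static-only Python binding for the op. Substituting the name into the method-table template shown above yields roughly the following entry; this is an illustrative expansion, not a line of this diff:

// Illustrative expansion of the generator template for name = "distributed_fused_lamb".
{"distributed_fused_lamb",
 (PyCFunction)(void (*)(void))distributed_fused_lamb,
 METH_VARARGS | METH_KEYWORDS,
 "C++ interface function for distributed_fused_lamb."},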
13 changes: 13 additions & 0 deletions paddle/fluid/pir/dialect/operator/ir/ops.yaml
@@ -431,6 +431,19 @@
data_type : fpn_rois
optional : rois_num, multi_level_rois_num

- op : distributed_fused_lamb
args : (Tensor[] param, Tensor[] grad, Tensor fp32_fused_param, Tensor fp32fusedgrad, Tensor fp16fusedparam, Tensor fp16fusedgrad, Tensor moment1, Tensor moment2, Tensor beta1pow, Tensor beta2pow, Tensor fusedparamoffsets, Tensor fp32shardfusedparamoffsets, Tensor fp16shardfusedparamoffsets, Tensor paraminfo, Tensor paramorder, Tensor learningrate, Tensor globalscale, float beta1, float beta2, float epsilon, float max_global_grad_norm, float weight_decay, bool clip_after_allreduce, int[] ring_ids= {}, int acc_steps = 1, bool use_master_param_norm = true, bool use_master_acc_grad = true, bool is_grad_scaled_by_nranks = true, int64_t nranks = 1, bool use_hierarchical_allreduce = false)
output : Tensor(fp32_fused_param_out), Tensor(fp16fusedparamout), Tensor(fp32accfusedgrad), Tensor(fp16accfusedgrad), Tensor(moment1out), Tensor(moment2out), Tensor(beta1powout), Tensor(beta2powout), Tensor[](paramout){param.size()}, Tensor(foundinf), Tensor(accstep), Tensor(stopupdate), Tensor(step)
infer_meta :
func : DistributedFusedLambInferMeta
param : [param, grad, fp32_fused_param, fp32fusedgrad, fp16fusedparam, fp16fusedgrad, moment1, moment2, beta1pow, beta2pow, fusedparamoffsets, fp32shardfusedparamoffsets, fp16shardfusedparamoffsets, paraminfo, paramorder, learningrate, globalscale, acc_steps, beta1, beta2, epsilon, max_global_grad_norm, weight_decay, clip_after_allreduce, use_master_param_norm, use_master_acc_grad, is_grad_scaled_by_nranks, use_hierarchical_allreduce, nranks, ring_ids]
kernel :
func : distributed_fused_lamb
data_type : DataType::FLOAT32
param : [param, grad, fp32_fused_param, fp32fusedgrad, fp16fusedparam, fp16fusedgrad, moment1, moment2, beta1pow, beta2pow, fusedparamoffsets, fp32shardfusedparamoffsets, fp16shardfusedparamoffsets, paraminfo, paramorder, learningrate, globalscale, acc_steps, beta1, beta2, epsilon, max_global_grad_norm, weight_decay, clip_after_allreduce, use_master_param_norm, use_master_acc_grad, is_grad_scaled_by_nranks, use_hierarchical_allreduce, nranks, ring_ids]
optional : fp32_fused_param, fp32fusedgrad, fp16fusedparam, fp16fusedgrad, fp32_fused_param_out, fp16fusedparamout, fp32accfusedgrad, fp16accfusedgrad, accstep, stopupdate
inplace : (fp32_fused_param -> fp32_fused_param_out), (fp16fusedparam -> fp16fusedparamout), (moment1 -> moment1out), (moment2 -> moment2out), (beta1pow -> beta1powout), (beta2pow -> beta2powout), (param -> paramout)

- op : distributed_fused_lamb_init
args : (Tensor[] param, Tensor[] grad, float beta1, float beta2, int[] apply_weight_decay, int alignment, int rank, int nranks)
output : Tensor(fp32_fused_param), Tensor(fp32_fused_grad), Tensor(fp16_fused_param), Tensor(fp16_fused_grad), Tensor(moment1), Tensor(moment2), Tensor(beta1_pow), Tensor(beta2_pow), Tensor(fused_param_offsets), Tensor(fp32_shard_fused_param_offsets), Tensor(fp16_shard_fused_param_offsets), Tensor(param_info), Tensor(param_order), Tensor[](param_out){param.size()}, Tensor[](master_param_out){param.size()}, Tensor[](grad_out){grad.size()}, Tensor(global_scale), Tensor(step)
6 changes: 6 additions & 0 deletions paddle/phi/api/yaml/op_compat.yaml
@@ -3637,6 +3637,12 @@
multi_level_rois_num: MultiLevelRoIsNum
restore_index: RestoreIndex

- op: distributed_fused_lamb
inputs:
{param: Param, grad: Grad, fp32_fused_param: FP32FusedParam, fp32fusedgrad: FP32FusedGrad, fp16fusedparam: FP16FusedParam, fp16fusedgrad: FP16FusedGrad, moment1: Moment1, moment2: Moment2, beta1pow: Beta1Pow, beta2pow: Beta2Pow, fusedparamoffsets: FusedParamOffsets, fp32shardfusedparamoffsets: FP32ShardFusedParamOffsets, fp16shardfusedparamoffsets: FP16ShardFusedParamOffsets, paraminfo: ParamInfo, paramorder: ParamOrder, learningrate: LearningRate, globalscale: GlobalScale}
outputs:
{paramout : ParamOut, fp32_fused_param_out: FP32FusedParamOut, fp16fusedparamout: FP16FusedParamOut, fp32accfusedgrad: FP32AccFusedGrad, fp16accfusedgrad: FP16AccFusedGrad, moment1out: Moment1Out, moment2out: Moment2Out, beta1powout: Beta1PowOut, beta2powout: Beta2PowOut, foundinf: FoundInf, accstep: AccStep, stopupdate: StopUpdate, step: Step}

- op: distributed_fused_lamb_init
inputs:
{param: Param, grad: Grad}
45 changes: 45 additions & 0 deletions paddle/phi/infermeta/multiary.cc
@@ -1508,6 +1508,51 @@ void DGCMomentumInferMeta(const MetaTensor& param,
}
}

void DistributedFusedLambInferMeta(
const std::vector<const phi::MetaTensor*>& param,
const std::vector<const phi::MetaTensor*>& grad,
const MetaTensor& fp32_fused_param,
const MetaTensor& fp32fusedgrad,
const MetaTensor& fp16fusedparam,
const MetaTensor& fp16fusedgrad,
const MetaTensor& moment1,
const MetaTensor& moment2,
const MetaTensor& beta1pow,
const MetaTensor& beta2pow,
const MetaTensor& fusedparamoffsets,
const MetaTensor& fp32shardfusedparamoffsets,
const MetaTensor& fp16shardfusedparamoffsets,
const MetaTensor& paraminfo,
const MetaTensor& paramorder,
const MetaTensor& learningrate,
const MetaTensor& globalscale,
int acc_steps,
float beta1,
float beta2,
float epsilon,
float max_global_grad_norm,
float weight_decay,
bool clip_after_allreduce,
bool use_master_param_norm,
bool use_master_acc_grad,
bool is_grad_scaled_by_nranks,
bool use_hierarchical_allreduce,
int64_t nranks,
const std::vector<int>& ring_ids,
MetaTensor* fp32_fused_param_out,
MetaTensor* fp16fusedparamout,
MetaTensor* fp32accfusedgrad,
MetaTensor* fp16accfusedgrad,
MetaTensor* moment1out,
MetaTensor* moment2out,
MetaTensor* beta1powout,
MetaTensor* beta2powout,
std::vector<MetaTensor*> paramout,
MetaTensor* foundinf,
MetaTensor* accstep,
MetaTensor* stopupdate,
MetaTensor* step) {}
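For context only: the InferMeta body added here is empty. A non-trivial body would typically forward shape and dtype from inputs to the matching outputs via MetaTensor::set_dims / set_dtype. The function below is a minimal, hypothetical sketch of that pattern with a reduced argument list; it is not part of this PR:

#include "paddle/phi/core/meta_tensor.h"

namespace phi {

// Illustrative only: forwards the moment tensors' meta, as an in-place
// optimizer update usually does. Not the implementation added by this PR.
void DistributedFusedLambInferMetaSketch(const MetaTensor& moment1,
                                         const MetaTensor& moment2,
                                         MetaTensor* moment1out,
                                         MetaTensor* moment2out) {
  moment1out->set_dims(moment1.dims());
  moment1out->set_dtype(moment1.dtype());
  moment2out->set_dims(moment2.dims());
  moment2out->set_dtype(moment2.dtype());
}

}  // namespace phi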

void EditDistanceInferMeta(const MetaTensor& hyps,
const MetaTensor& refs,
const MetaTensor& hypslength,
45 changes: 45 additions & 0 deletions paddle/phi/infermeta/multiary.h
@@ -294,6 +294,51 @@ void DeformableConvInferMeta(const MetaTensor& x,
MetaTensor* out,
MetaConfig config = MetaConfig());

void DistributedFusedLambInferMeta(
const std::vector<const phi::MetaTensor*>& param,
const std::vector<const phi::MetaTensor*>& grad,
const MetaTensor& fp32_fused_param,
const MetaTensor& fp32fusedgrad,
const MetaTensor& fp16fusedparam,
const MetaTensor& fp16fusedgrad,
const MetaTensor& moment1,
const MetaTensor& moment2,
const MetaTensor& beta1pow,
const MetaTensor& beta2pow,
const MetaTensor& fusedparamoffsets,
const MetaTensor& fp32shardfusedparamoffsets,
const MetaTensor& fp16shardfusedparamoffsets,
const MetaTensor& paraminfo,
const MetaTensor& paramorder,
const MetaTensor& learningrate,
const MetaTensor& globalscale,
int acc_steps,
float beta1,
float beta2,
float epsilon,
float max_global_grad_norm,
float weight_decay,
bool clip_after_allreduce,
bool use_master_param_norm,
bool use_master_acc_grad,
bool is_grad_scaled_by_nranks,
bool use_hierarchical_allreduce,
int64_t nranks,
const std::vector<int>& ring_ids,
MetaTensor* fp32_fused_param_out,
MetaTensor* fp16fusedparamout,
MetaTensor* fp32accfusedgrad,
MetaTensor* fp16accfusedgrad,
MetaTensor* moment1out,
MetaTensor* moment2out,
MetaTensor* beta1powout,
MetaTensor* beta2powout,
std::vector<MetaTensor*> paramout,
MetaTensor* foundinf,
MetaTensor* accstep,
MetaTensor* stopupdate,
MetaTensor* step);

void DGCMomentumInferMeta(const MetaTensor& param,
const MetaTensor& grad,
const MetaTensor& velocity,