12 | 12 | # See the License for the specific language governing permissions and |
13 | 13 | # limitations under the License. |
14 | 14 |
15 | | -import argparse |
16 | | -import math |
17 | 15 | import os |
18 | 16 | import random |
19 | 17 | import time |
| 18 | +import types |
| 19 | +from types import MethodType |
20 | 20 |
| 21 | +import lr |
21 | 22 | import numpy as np |
22 | 23 | import paddle |
23 | | -from visualdl import LogWriter |
24 | | -from modeling import GPTModel, GPTForPretraining, GPTPretrainingCriterion, GPTForPretrainingPipe |
25 | | -from paddlenlp.transformers import GPTTokenizer, GPTChineseTokenizer |
26 | | -from paddlenlp.utils.log import logger |
27 | | - |
28 | | -from dataset import create_pretrained_dataset |
| 24 | +import paddle.distributed as dist |
29 | 25 | from args import parse_args |
30 | | -import lr |
| 26 | +from checkpointing import load_checkpoint, save_checkpoint |
| 27 | +from dataset import create_pretrained_dataset |
| 28 | +from framework import AdamW, group_sharded_parallel, obtain_storage |
| 29 | +from modeling import ( |
| 30 | + GPTForPretraining, |
| 31 | + GPTForPretrainingPipe, |
| 32 | + GPTModel, |
| 33 | + GPTPretrainingCriterion, |
| 34 | +) |
| 35 | +from paddle import _legacy_C_ops |
31 | 36 | from paddle.distributed import fleet |
32 | 37 | from paddle.distributed.fleet.meta_parallel import get_rng_state_tracker |
33 | | -from paddle.distributed.fleet.utils.hybrid_parallel_util import fused_allreduce_gradients |
34 | | -import types |
35 | | -from utils import get_timers, set_timers |
36 | | -from types import MethodType |
37 | | -from paddle import _legacy_C_ops |
| 38 | +from paddle.distributed.fleet.meta_parallel.sharding.group_sharded_utils import ( |
| 39 | + GroupShardedScaler, |
| 40 | +) |
| 41 | +from paddle.distributed.fleet.meta_parallel.sharding.sharding_utils import ( |
| 42 | + ShardingScaler, |
| 43 | +) |
38 | 44 | from paddle.fluid.framework import core, in_dygraph_mode |
39 | | -import paddle.distributed as dist |
40 | | -from framework import assign_group_by_size, flatten_dense_tensors, obtain_storage, AdamW, group_sharded_parallel |
41 | 45 | from paddle.incubate.distributed.models import moe |
42 | | -from paddle.distributed.fleet.meta_parallel.sharding.sharding_utils import ShardingScaler |
43 | | -from paddle.distributed.fleet.meta_parallel.sharding.group_sharded_utils import GroupShardedScaler |
| 46 | +from utils import get_timers, set_timers |
| 47 | +from visualdl import LogWriter |
44 | 48 |
45 | | -from checkpointing import save_checkpoint, load_checkpoint |
| 49 | +from paddlenlp.transformers import GPTChineseTokenizer, GPTTokenizer |
| 50 | +from paddlenlp.utils.log import logger |
46 | 51 |
47 | 52 | MODEL_CLASSES = { |
48 | 53 | "gpt": (GPTForPretraining, GPTTokenizer), |
@@ -172,7 +177,7 @@ def unscale_method(self, optimizer): |
172 | 177 | if dist.get_world_size() > 1: |
173 | 178 | is_found_inf = paddle.to_tensor([self._found_inf], dtype="int32") |
174 | 179 | paddle.distributed.all_reduce(is_found_inf, op=paddle.distributed.ReduceOp.MAX, group=None) |
175 | | - self._found_inf = is_found_inf.numpy()[0] |
| 180 | + self._found_inf = int(is_found_inf) |
176 | 181 |
177 | 182 |
178 | 183 | def all_reduce_parameters(params, group): |
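For context, the patched `unscale_method` above only takes effect once it is bound onto the scaler instance. A minimal sketch of that wiring, assuming an `args.scale_loss` setting and the `_unscale` hook name used by similar PaddleNLP MoE scripts (both are assumptions, not lines from this diff):

```python
# Sketch only: `args.scale_loss` and the `_unscale` attribute name are assumed.
# Binding unscale_method with MethodType makes the scaler all-reduce the
# found_inf flag with MAX across ranks, so every rank agrees on whether to
# skip the optimizer step.
from types import MethodType

import paddle
from paddle.distributed.fleet.meta_parallel.sharding.group_sharded_utils import (
    GroupShardedScaler,
)
from paddle.distributed.fleet.meta_parallel.sharding.sharding_utils import (
    ShardingScaler,
)
from paddle.fluid.framework import in_dygraph_mode

scaler = paddle.amp.GradScaler(init_loss_scaling=args.scale_loss)
scaler = GroupShardedScaler(scaler) if in_dygraph_mode() else ShardingScaler(scaler)
scaler._unscale = MethodType(unscale_method, scaler)  # assumed hook name
```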
@@ -437,7 +442,7 @@ def do_train(args): |
437 | 442 |
438 | 443 | clip = None |
439 | 444 | if args.grad_clip > 0: |
440 | | - is_expert_param_fun = lambda param: param.name in expert_fusion_names |
| 445 | + is_expert_param_fun = lambda param: param.name in expert_fusion_names # noqa: E731 |
441 | 446 | clip = moe.ClipGradByGlobalNorm( |
442 | 447 | clip_norm=args.grad_clip, |
443 | 448 | is_expert_param_func=is_expert_param_fun, |
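`moe.ClipGradByGlobalNorm` relies on `is_expert_param_func` to separate expert parameters (here, those whose names appear in `expert_fusion_names`) from dense parameters so their gradient norms are clipped independently. A rough sketch of how the resulting `clip` is typically handed to an optimizer; the plain `paddle.optimizer.AdamW` call and the `lr_scheduler` variable are illustrative assumptions, not code from this commit:

```python
# Illustrative only: lr_scheduler and the paddle.optimizer.AdamW call are
# assumptions. The clip is passed as grad_clip, so expert parameters and
# dense parameters have their global norms computed and clipped separately.
def is_expert_param_fun(param):
    return param.name in expert_fusion_names

clip = moe.ClipGradByGlobalNorm(
    clip_norm=args.grad_clip,
    is_expert_param_func=is_expert_param_fun,
)
optimizer = paddle.optimizer.AdamW(
    learning_rate=lr_scheduler,
    parameters=model.parameters(),
    grad_clip=clip,
)
```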