|
39 | 39 | import paddle.distributed as dist |
40 | 40 | import paddle.nn as nn |
41 | 41 | from packaging import version |
| 42 | +from paddle import framework |
| 43 | +from paddle.base import core |
42 | 44 | from paddle.distributed import fleet |
43 | 45 | from paddle.distributed.fleet.meta_optimizers.dygraph_optimizer.hybrid_parallel_optimizer import ( |
44 | 46 | HybridParallelOptimizer, |
@@ -1257,6 +1259,20 @@ def _maybe_log_save_evaluate(self, tr_loss, model, epoch, ignore_keys_for_eval, |
1257 | 1259 | logs["learning_rate"] = float("{0:.3e}".format(self._get_learning_rate())) |
1258 | 1260 | logs["global_step"] = int(self.state.global_step) |
1259 | 1261 |
|
| 1262 | + divisor = 2**30 |
| 1263 | + # TODO(@gexiao): replace this code with Paddle's unified memory-stat APIs |
| 1264 | + current_device = framework._current_expected_place_() |
| 1265 | + if str(current_device) != "Place(cpu)": |
| 1266 | + device_id = current_device.get_device_id() |
| 1267 | + current_memory_allocated = core.device_memory_stat_current_value("Allocated", device_id) |
| 1268 | + current_memory_reserved = core.device_memory_stat_current_value("Reserved", device_id) |
| 1269 | + max_memory_allocated = core.device_memory_stat_peak_value("Allocated", device_id) |
| 1270 | + max_memory_reserved = core.device_memory_stat_peak_value("Reserved", device_id) |
| 1271 | + logs["current_memory_allocated"] = current_memory_allocated / divisor |
| 1272 | + logs["current_memory_reserved"] = current_memory_reserved / divisor |
| 1273 | + logs["max_memory_allocated"] = max_memory_allocated / divisor |
| 1274 | + logs["max_memory_reserved"] = max_memory_reserved / divisor |
| 1275 | + |
1260 | 1276 | total_train_batch_size = ( |
1261 | 1277 | self.args.train_batch_size * self.args.gradient_accumulation_steps * self.args.dataset_world_size |
1262 | 1278 | ) |
@@ -1614,8 +1630,6 @@ def _load_rng_state(self, checkpoint): |
1614 | 1630 | random.setstate(checkpoint_rng_state["python"]) |
1615 | 1631 | np.random.set_state(checkpoint_rng_state["numpy"]) |
1616 | 1632 |
|
1617 | | - core = paddle.framework.core |
1618 | | - |
1619 | 1633 | core.default_cpu_generator().set_state(checkpoint_rng_state["cpu"]) |
1620 | 1634 | if core.is_compiled_with_cuda(): |
1621 | 1635 | if not len(checkpoint_rng_state["cuda"]) == core.get_cuda_device_count(): |
|
0 commit comments