
Commit 2318d04

fix(base_trainer): gather weights in save_pretrained under zero3 (#429)
* feat(configs): make saving optimizer state optional
* fix(base_trainer): `save_pretrained` under zero3
* style
* revert(configs): revert to the default, save-the-whole-state behaviour
1 parent 92b68e4 commit 2318d04

File tree: 3 files changed, +29 -20 lines

trlx/data/configs.py

Lines changed: 1 addition & 0 deletions
@@ -220,6 +220,7 @@ class TrainConfig:
     checkpoint_dir: str = "ckpts"
     rollout_logging_dir: Optional[str] = None
     save_best: bool = True
+    save_optimizer: bool = True

     tracker: Optional[str] = "wandb"
     logging_dir: Optional[str] = None
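
The new field defaults to True, so nothing changes unless a user opts out; when it is False, the trainer below falls back to `save_pretrained` (weights only) instead of the full Accelerate state. A minimal sketch of flipping the flag, assuming a config is loaded the way the trlx examples do (the YAML path is a placeholder):

from trlx.data.configs import TRLConfig

# Load a base config (path is illustrative) and disable optimizer-state checkpoints,
# so intermediate saves write only the Hugging Face model via save_pretrained().
config = TRLConfig.load_yaml("configs/ppo_config.yml")
config.train.save_optimizer = False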

trlx/models/modeling_base.py

Lines changed: 1 addition & 1 deletion
@@ -198,7 +198,7 @@ def save_pretrained(self, *args, **kwargs):
                 Keyword arguments passed along to the underlying model's
                 `save_pretrained` method.
         """
-        state_dict = kwargs.pop("state_dict", None)
+        state_dict = kwargs.get("state_dict", None)
         if state_dict is None:
             state_dict = self.state_dict()
         kwargs["state_dict"] = state_dict

trlx/trainer/accelerate_base_trainer.py

Lines changed: 27 additions & 19 deletions
@@ -277,8 +277,16 @@ def save_pretrained(self, directory: Optional[str] = None, **kwargs):
         """
         if directory is None:
             directory = os.path.join(self.config.train.checkpoint_dir, "hf_model")
+
         self.accelerator.wait_for_everyone()
-        self.accelerator.unwrap_model(self.model).save_pretrained(directory, **kwargs)
+        self.accelerator.unwrap_model(self.model).save_pretrained(
+            directory,
+            save_function=self.accelerator.save,
+            is_main_process=self.accelerator.is_main_process,
+            state_dict=self.accelerator.get_state_dict(self.model),
+            **kwargs,
+        )
+
         if self.accelerator.is_main_process:
             self.tokenizer.save_pretrained(directory)
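
This is the core of the fix: under DeepSpeed ZeRO-3 each rank holds only a shard of the parameters, so saving the local `state_dict()` would produce incomplete weights. `accelerator.get_state_dict(self.model)` gathers the full state dict, `is_main_process` restricts file writes to one rank, and `save_function=self.accelerator.save` defers the actual write to Accelerate. A standalone sketch of the same pattern outside the trainer (model name and output path are placeholders):

from accelerate import Accelerator
from transformers import AutoModelForCausalLM

accelerator = Accelerator()
model = accelerator.prepare(AutoModelForCausalLM.from_pretrained("gpt2"))

# ... training loop ...

accelerator.wait_for_everyone()
unwrapped = accelerator.unwrap_model(model)
unwrapped.save_pretrained(
    "out/hf_model",                                # placeholder directory
    is_main_process=accelerator.is_main_process,   # only one rank writes files
    save_function=accelerator.save,                # let Accelerate handle the actual save
    state_dict=accelerator.get_state_dict(model),  # gathers ZeRO-3 shards into a full state dict
)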

@@ -540,17 +548,24 @@ def learn(self):  # noqa: C901
                 self.scheduler.step()
                 self.iter_count += 1

-                if self.iter_count % self.config.train.checkpoint_interval == 0:
+                if (
+                    self.iter_count % self.config.train.checkpoint_interval == 0
+                    or self.iter_count >= self.total_steps
+                ):
                     subfolder = f"checkpoint_{self.iter_count:0{len(str(self.total_steps))}d}"
                     directory = os.path.join(self.config.train.checkpoint_dir, subfolder)
-                    self.save(directory)
+                    logger.info(f"Saving intermediate checkpoint into {directory}")
+                    if self.config.train.save_optimizer:
+                        self.save(directory)
+                    else:
+                        self.save_pretrained(directory)

                 stats["time/forward"] = forward_time
                 stats["time/backward"] = backward_time
                 for group_number, lr in enumerate(self.scheduler.get_last_lr()):
                     stats[f"learning_rate_group_{group_number}"] = lr

-                if self.iter_count % self.config.train.eval_interval == 0:
+                if self.iter_count % self.config.train.eval_interval == 0 or self.iter_count >= self.total_steps:
                     results = self.evaluate()
                     stats.update(results)
                     if ray.is_initialized():
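
A small detail in the hunk above: the checkpoint subfolder zero-pads the step count to the width of `total_steps`, so checkpoint directories sort lexicographically in step order. For example (values are illustrative):

iter_count, total_steps = 150, 10000
subfolder = f"checkpoint_{iter_count:0{len(str(total_steps))}d}"
print(subfolder)  # -> checkpoint_00150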
@@ -571,29 +586,22 @@ def learn(self):  # noqa: C901
                     if torch.distributed.is_initialized():
                         torch.distributed.all_reduce(do_save, torch.distributed.ReduceOp.MAX)
                     if do_save:
-                        best_path = f"{self.config.train.checkpoint_dir}/best_checkpoint"
-                        logger.info(f"Saving the best state so far into {best_path}")
-                        self.save(best_path)
+                        directory = os.path.join(self.config.train.checkpoint_dir, "best_checkpoint")
+                        logger.info(f"Saving the best state so far into {directory}")
+                        if self.config.train.save_optimizer:
+                            self.save(directory)
+                        else:
+                            self.save_pretrained(directory)

                 desc = " | ".join(f"{k}: {v:.2f}" for k, v in stats.items() if k.startswith("loss"))
                 tbar.set_description(f"[{desc}]")
                 tbar.update()

-                if self.iter_count >= self.total_steps:
-                    subfolder = f"checkpoint_{self.iter_count:0{len(str(self.total_steps))}d}"
-                    directory = os.path.join(self.config.train.checkpoint_dir, subfolder)
-                    results = self.evaluate()
-                    stats.update(results)
-
-                    if ray.is_initialized():
-                        session.report(filter_non_scalars(stats), checkpoint=checkpoint)
-                    self.accelerator.log(stats, step=self.iter_count)
+                self.accelerator.log(stats, step=self.iter_count)

-                    self.save(directory)
+                if self.iter_count >= self.total_steps:
                     return results

-                self.accelerator.log(stats, step=self.iter_count)
-
             self.post_backward_callback()

         self.post_epoch_callback()
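
One more pattern visible in the context lines: the `do_save` flag is MAX-reduced across ranks before the branch, so every process agrees on whether to save and the collective calls inside `save`/`save_pretrained` cannot deadlock. A self-contained sketch of that idea (the helper name is made up; a process group may or may not be initialized):

import torch
import torch.distributed as dist

def sync_do_save(local_improved: bool) -> bool:
    """Return True on every rank if any rank observed an improvement."""
    # CPU tensor shown for simplicity; move it to the right device for NCCL backends.
    flag = torch.tensor(int(local_improved))
    if dist.is_initialized():
        dist.all_reduce(flag, op=dist.ReduceOp.MAX)
    return bool(flag.item())

# Single-process example: no process group, so the local decision is used as-is.
print(sync_do_save(local_improved=False))  # -> False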
