Commit e878d48

ptrcklv authored and vincentschen committed
Save optimizer state in Trainer (#1533)
Addresses #1416
1 parent e711035 commit e878d48
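
For context, the resume-training workflow this commit enables looks roughly like the sketch below, adapted from the docstring example in the diff; the model, tasks, and dataloaders objects are placeholders, not part of this commit.

    from snorkel.classification import MultitaskClassifier, Trainer

    # Save a fitted model together with its trainer (paths are placeholders).
    model.save("./my_saved_model_file")
    trainer.save("./my_saved_trainer_file")

    # Load both into fresh objects; load() restores the trainer config and,
    # because a model is passed, the optimizer state as well.
    new_model = MultitaskClassifier(tasks)
    new_model.load("./my_saved_model_file")
    new_trainer = Trainer()
    new_trainer.load("./my_saved_trainer_file", model=new_model)
    new_trainer.fit(new_model, dataloaders)  # continue training with the restored optimizer state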

File tree

2 files changed: 111 additions & 0 deletions

snorkel/classification/training/trainer.py

Lines changed: 73 additions & 0 deletions
@@ -1,4 +1,5 @@
 import logging
+import os
 from collections import defaultdict
 from typing import Any, DefaultDict, Dict, List, Optional

@@ -509,3 +510,75 @@ def _reset_losses(self) -> None:
         """Reset the loss counters."""
         self.running_losses = defaultdict(float)
         self.running_counts = defaultdict(int)
+
+    def save(self, trainer_path: str) -> None:
+        """Save the trainer config and optimizer state to the specified file path.
+
+        Parameters
+        ----------
+        trainer_path
+            The path where the trainer config and optimizer state should be saved.
+        """
+        head, tail = os.path.split(trainer_path)
+
+        if not os.path.exists(head):
+            os.makedirs(head)
+        try:
+            torch.save(
+                {
+                    "trainer_config": self.config._asdict(),
+                    "optimizer_state_dict": self.optimizer.state_dict(),
+                },
+                trainer_path,
+            )
+        except BaseException:  # pragma: no cover
+            logging.warning("Saving failed... continuing anyway.")
+
+        logging.info(f"[{self.name}] Trainer config saved in {trainer_path}")
+
+    def load(self, trainer_path: str, model: Optional[MultitaskClassifier]) -> None:
+        """Load the trainer config and optimizer state from the specified file path.
+
+        The optimizer state is restored only when a model is given, and it is
+        only meaningful if that model is the one the stored Trainer was fit on.
+
+        Parameters
+        ----------
+        trainer_path
+            The path to the saved trainer config to be loaded.
+        model
+            The MultitaskClassifier for which the optimizer was set. The
+            optimizer's parameters must match the model's parameters, so this
+            should be the model that was fit by the stored Trainer.
+
+        Example
+        -------
+        Saving a model and its corresponding trainer:
+
+        >>> model.save('./my_saved_model_file')  # doctest: +SKIP
+        >>> trainer.save('./my_saved_trainer_file')  # doctest: +SKIP
+
+        We can then resume training by loading the saved model and trainer into
+        new model and trainer objects:
+
+        >>> new_model.load('./my_saved_model_file')  # doctest: +SKIP
+        >>> new_trainer.load('./my_saved_trainer_file', model=new_model)  # doctest: +SKIP
+        >>> new_trainer.fit(...)  # doctest: +SKIP
+        """
+        try:
+            saved_state = torch.load(trainer_path)
+        except BaseException:
+            if not os.path.exists(trainer_path):
+                logging.error("Loading failed... Trainer config does not exist.")
+            else:
+                logging.error(
+                    f"Loading failed... Cannot load trainer config from {trainer_path}"
+                )
+            raise
+
+        self.config = TrainerConfig(**saved_state["trainer_config"])
+        logging.info(f"[{self.name}] Trainer config loaded from {trainer_path}")
+
+        if model is not None:
+            try:
+                self._set_optimizer(model)
+                self.optimizer.load_state_dict(saved_state["optimizer_state_dict"])
+                logging.info(f"[{self.name}] Optimizer loaded from {trainer_path}")
+            except BaseException:
+                logging.error(
+                    "Loading the optimizer for your model failed. "
+                    "Optimizer state NOT loaded."
+                )

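Since save writes an ordinary torch checkpoint with exactly the two keys shown above, the file can be inspected directly; a minimal sketch, assuming a checkpoint was already written to the placeholder path:

    import torch

    saved_state = torch.load("./my_saved_trainer_file")
    print(saved_state["trainer_config"])               # plain dict from TrainerConfig._asdict()
    print(saved_state["optimizer_state_dict"].keys())  # dict_keys(['state', 'param_groups'])
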
test/classification/training/test_trainer.py

Lines changed: 38 additions & 0 deletions
@@ -1,3 +1,4 @@
+import collections.abc
 import copy
 import json
 import os
@@ -216,6 +217,43 @@ def test_warmup(self):
         trainer.fit(model, [dataloaders[0]])
         self.assertEqual(trainer.warmup_steps, 1)
 
+    def test_save_load(self):
+        non_base_config = {"n_epochs": 2, "progress_bar": False}
+        trainer1 = Trainer(**base_config, lr_scheduler="exponential")
+        trainer1.fit(model, [dataloaders[0]])
+        trainer2 = Trainer(**non_base_config, lr_scheduler="linear")
+        trainer3 = Trainer(**non_base_config, lr_scheduler="linear")
+
+        with tempfile.NamedTemporaryFile() as fd:
+            checkpoint_path = fd.name
+            trainer1.save(checkpoint_path)
+            trainer2.load(checkpoint_path, model=model)
+            trainer3.load(checkpoint_path, None)
+
+        self.assertEqual(trainer1.config, trainer2.config)
+        self.dict_check(
+            trainer1.optimizer.state_dict(), trainer2.optimizer.state_dict()
+        )
+
+        # continue training after load
+        trainer2.fit(model, [dataloaders[0]])
+
+        # loading without a model should restore the trainer config
+        # but not the optimizer state
+        self.assertEqual(trainer1.config, trainer3.config)
+        self.assertFalse(hasattr(trainer3, "optimizer"))
+        trainer3.fit(model, [dataloaders[0]])
+
+    def dict_check(self, dict1, dict2):
+        # recursively compare two (possibly nested) state dicts,
+        # comparing tensor leaves element-wise
+        for k in dict1.keys():
+            dict1_ = dict1[k]
+            dict2_ = dict2[k]
+            if isinstance(dict1_, collections.abc.Mapping):
+                self.dict_check(dict1_, dict2_)
+            elif isinstance(dict1_, torch.Tensor):
+                self.assertTrue(torch.eq(dict1_, dict2_).all())
+            else:
+                self.assertEqual(dict1_, dict2_)
+
 
 if __name__ == "__main__":
     unittest.main()
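
Assuming the repository's usual pytest tooling, the new test can be run on its own with:

    pytest test/classification/training/test_trainer.py -k test_save_load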
