Skip to content

Commit d94606d

Browse files
committed
add model weight converter command
1 parent dbbd3ea commit d94606d

File tree

2 files changed

+63
-1
lines changed

2 files changed

+63
-1
lines changed

paddlenlp/commands/cli.py

Lines changed: 49 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,11 +13,11 @@
1313
# limitations under the License.
1414

1515
import os
16+
import json
1617
from typing import Type, List, Tuple
1718
from typer import Typer
1819
import shutil
1920
import importlib, inspect
20-
from dataclasses import dataclass
2121
from paddlenlp.transformers import AutoModel, AutoTokenizer, PretrainedModel, PretrainedTokenizer
2222
from paddlenlp.utils.log import logger
2323
from paddlenlp.utils.env import MODEL_HOME
@@ -102,6 +102,54 @@ def search(query: str):
102102
tabulate(tables, headers=['model type', 'model name'], tablefmt="grid"))
103103

104104

105+
@app.command()
def convert(model_type: str,
            config_or_model_name: str,
            pytorch_checkpoint_path: str = 'pytorch',
            dump_output: str = "model_state.pdparams"):
    """Convert a pytorch weight file to a paddle weight file.

    Args:
        model_type (str): target paddle model type, e.g. ``bert`` or ``albert``.
        config_or_model_name (str): a pretrained model name registered in the
            target model class (e.g. ``bert-base-uncased``), or the path of a
            local json configuration file.
        pytorch_checkpoint_path (str, optional): path of the pytorch weight
            file, or of a directory containing ``pytorch_model.bin``.
            Defaults to 'pytorch'.
        dump_output (str, optional): path of the converted paddle weight file.
            Defaults to "model_state.pdparams".

    Raises:
        FileNotFoundError: if the pytorch checkpoint file or the configuration
            file can not be found.
    """
    # 1. resolve pytorch weight file path: a directory is interpreted as
    # containing the conventional `pytorch_model.bin` file
    if os.path.isdir(pytorch_checkpoint_path):
        pytorch_checkpoint_path = os.path.join(pytorch_checkpoint_path,
                                               "pytorch_model.bin")
    if not os.path.isfile(pytorch_checkpoint_path):
        # raise (not assert): validation must survive `python -O`
        raise FileNotFoundError(
            "pytorch checkpoint file {} not found".format(
                pytorch_checkpoint_path))

    def resolve_configuration(model_class: Type[PretrainedModel]) -> dict:
        # prefer a registered pretrained configuration; otherwise treat the
        # argument as the path of a local json configuration file
        if config_or_model_name in model_class.pretrained_init_configuration:
            return model_class.pretrained_init_configuration[
                config_or_model_name]
        if not os.path.isfile(config_or_model_name):
            raise FileNotFoundError(
                "can't find the configuration file by <{}>".format(
                    config_or_model_name))
        with open(config_or_model_name, 'r', encoding='utf-8') as f:
            return json.load(f)

    # 2. dispatch to the model-specific weight converter
    if model_type == 'bert':
        from paddlenlp.transformers.bert.modeling import convert_pytorch_weights, BertModel
        config = resolve_configuration(BertModel)
        model = BertModel(**config)
        convert_pytorch_weights(model,
                                pytorch_checkpoint_path=pytorch_checkpoint_path)
    elif model_type == 'albert':
        from paddlenlp.transformers.albert.modeling import AlbertModel
        config = resolve_configuration(AlbertModel)
        model = AlbertModel(**config)
        # TODO(wj-Mcat): albert weight conversion is not implemented yet —
        # call the albert `convert` method here once it exists
152+
105153
def main():
    """Entry point of the PaddleNLP command-line interface."""
    app()

paddlenlp/transformers/bert/modeling.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,9 @@
1111
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License.
14+
from __future__ import annotations
1415
import warnings
16+
import os
1517

1618
import paddle
1719
import paddle.nn as nn
@@ -23,6 +25,7 @@
2325
FusedTransformerEncoderLayer = None
2426
from dataclasses import dataclass
2527
from typing import List, Optional, Tuple, Union
28+
from ...utils.log import logger
2629
from .. import PretrainedModel, register_base_model
2730
from ..model_outputs import (
2831
BaseModelOutputWithPastAndCrossAttentions,
@@ -1667,3 +1670,14 @@ def forward(self,
16671670
hidden_states=outputs.hidden_states,
16681671
attentions=outputs.attentions,
16691672
)
1673+
1674+
1675+
def convert_pytorch_weights(model: BertPretrainedModel,
                            pytorch_checkpoint_path: str):
    """Convert a pytorch checkpoint into paddle weights for *model*.

    NOTE: this is a work-in-progress stub — it currently only loads the
    pytorch state dict; the actual name-mapping/conversion is still TODO.

    Args:
        model (BertPretrainedModel): the paddle model instance that will
            receive the converted weights.
        pytorch_checkpoint_path (str): path of the pytorch weight file
            (a ``torch.save``-serialized state dict).
    """
    # 1. load the pytorch model weight file
    # (fixed: previously loaded from an undefined name `torch_file`)
    import torch
    # NOTE(review): torch.load unpickles the file — only use trusted checkpoints
    torch_weight: dict = torch.load(pytorch_checkpoint_path)
    paddle_weight = {}

    # 2. load mapping configuration
    # TODO(wj-Mcat): from existing codebase

0 commit comments

Comments
 (0)