Skip to content
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion paddlenlp/transformers/auto/modeling.py
Original file line number Diff line number Diff line change
Expand Up @@ -233,7 +233,9 @@ def _get_model_class_from_config(cls, pretrained_model_name_or_path, config_file
)

@classmethod
def _from_pretrained(cls, pretrained_model_name_or_path, task=None, from_hf_hub=False, *model_args, **kwargs):
def _from_pretrained(
cls, pretrained_model_name_or_path, task=None, from_hf_hub=False, subfolder=None, *model_args, **kwargs
):
if task:
if cls._task_choice:
cls._name_mapping = get_name_mapping(task)
Expand All @@ -251,6 +253,7 @@ def _from_pretrained(cls, pretrained_model_name_or_path, task=None, from_hf_hub=
config_file = hf_hub_download(
repo_id=pretrained_model_name_or_path,
filename=cls.model_config_file,
subfolder=subfolder,
cache_dir=HF_CACHE_HOME,
library_name="PaddleNLP",
library_version=__version__,
Expand All @@ -260,6 +263,7 @@ def _from_pretrained(cls, pretrained_model_name_or_path, task=None, from_hf_hub=
config_file = hf_hub_download(
repo_id=pretrained_model_name_or_path,
filename=cls.legacy_model_config_file,
subfolder=subfolder,
cache_dir=HF_CACHE_HOME,
library_name="PaddleNLP",
library_version=__version__,
Expand Down
6 changes: 5 additions & 1 deletion paddlenlp/transformers/auto/tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -205,7 +205,7 @@ def _get_tokenizer_class_from_config(cls, pretrained_model_name_or_path, config_
return tokenizer_class

@classmethod
def from_pretrained(cls, pretrained_model_name_or_path, from_hf_hub=False, *model_args, **kwargs):
def from_pretrained(cls, pretrained_model_name_or_path, from_hf_hub=False, subfolder=None, *model_args, **kwargs):
"""
Creates an instance of `AutoTokenizer`. Related resources are loaded by
specifying name of a built-in pretrained model, or a community-contributed
Expand All @@ -219,6 +219,9 @@ def from_pretrained(cls, pretrained_model_name_or_path, from_hf_hub=False, *mode
- Name of a community-contributed pretrained model.
- Local directory path which contains tokenizer related resources
and tokenizer config file ("tokenizer_config.json").
from_hf_hub (bool, optional): Whether to load from HuggingFace Hub.
subfolder (str, optional): An optional value corresponding to a folder inside the repo.
Only works when loading from HuggingFace Hub.
*args (tuple): position arguments for model `__init__`. If provided,
use these as position argument values for tokenizer initialization.
**kwargs (dict): keyword arguments for model `__init__`. If provided,
Expand Down Expand Up @@ -263,6 +266,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, from_hf_hub=False, *mode
config_file = hf_hub_download(
repo_id=pretrained_model_name_or_path,
filename=cls.tokenizer_config_file,
subfolder=subfolder,
cache_dir=HF_CACHE_HOME,
library_name="PaddleNLP",
library_version=__version__,
Expand Down
4 changes: 3 additions & 1 deletion paddlenlp/transformers/clip/modeling.py
Original file line number Diff line number Diff line change
Expand Up @@ -498,6 +498,7 @@ def from_pretrained_v2(cls, pretrained_model_name_or_path, from_hf_hub: bool = F
ignore_mismatched_sizes = kwargs.pop("ignore_mismatched_sizes", None)
dtype = kwargs.pop("dtype", None)
cache_dir = kwargs.pop("cache_dir", None)
subfolder = kwargs.pop("subfolder", None)

cache_dir = resolve_cache_dir(pretrained_model_name_or_path, from_hf_hub, cache_dir)

Expand All @@ -511,6 +512,7 @@ def from_pretrained_v2(cls, pretrained_model_name_or_path, from_hf_hub: bool = F
return_unused_kwargs=True,
force_download=force_download,
from_hf_hub=from_hf_hub,
subfolder=subfolder,
**kwargs,
)
# Attention! we do not save this config.json
Expand All @@ -522,7 +524,7 @@ def from_pretrained_v2(cls, pretrained_model_name_or_path, from_hf_hub: bool = F

# 3. resolve model_weight file
model_weight_file = cls._resolve_model_file_path(
pretrained_model_name_or_path, cache_dir=cache_dir, from_hf_hub=from_hf_hub
pretrained_model_name_or_path, cache_dir=cache_dir, from_hf_hub=from_hf_hub, subfolder=subfolder
)

# 4. loading the state dict
Expand Down
12 changes: 8 additions & 4 deletions paddlenlp/transformers/configuration_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -248,17 +248,18 @@ def is_standard_config(config: Union[PretrainedConfig, Dict[str, Any]]) -> bool:
return "init_class" not in config and "architectures" in config


def resolve_hf_config_path(repo_id: str, cache_dir: str) -> str:
def resolve_hf_config_path(repo_id: str, cache_dir: str, subfolder=None) -> str:
"""resolve config file from hf hub

Args:
repo_id (str): the repo name from huggingface hub
cache_dir (str): the cachedir
subfolder (str, optional): An optional value corresponding to a folder inside the repo.

Returns:
str: the downloaded config file
"""
if hf_file_exists(repo_id, CONFIG_NAME):
if hf_file_exists(repo_id=repo_id, filename=CONFIG_NAME, subfolder=subfolder):
file_name = CONFIG_NAME
else:
raise EntryNotFoundError(f"can not find the paddle/pytorch config file from: https://huggingface.co/{repo_id}")
Expand All @@ -267,6 +268,7 @@ def resolve_hf_config_path(repo_id: str, cache_dir: str) -> str:
repo_id=repo_id,
filename=file_name,
cache_dir=cache_dir,
subfolder=subfolder,
library_name="PaddleNLP",
library_version=__version__,
)
Expand Down Expand Up @@ -770,6 +772,7 @@ def _get_config_dict(
) -> Tuple[Dict[str, Any], Dict[str, Any]]:
cache_dir = kwargs.pop("cache_dir", None)
from_hf_hub = kwargs.pop("from_hf_hub", False)
subfolder = kwargs.pop("subfolder", None)
cache_dir = resolve_cache_dir(pretrained_model_name_or_path, from_hf_hub, cache_dir)

force_download = kwargs.pop("force_download", False)
Expand Down Expand Up @@ -813,8 +816,9 @@ def _get_config_dict(

# 4. get the configuration file from HF hub
elif from_hf_hub:
resolved_config_file = resolve_hf_config_path(repo_id=pretrained_model_name_or_path, cache_dir=cache_dir)

resolved_config_file = resolve_hf_config_path(
repo_id=pretrained_model_name_or_path, cache_dir=cache_dir, subfolder=subfolder
)
else:
community_url = os.path.join(COMMUNITY_MODEL_PREFIX, pretrained_model_name_or_path, CONFIG_NAME)
if url_file_exists(community_url):
Expand Down
20 changes: 19 additions & 1 deletion paddlenlp/transformers/image_processing_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,12 @@
from typing import Any, Dict, Iterable, Optional, Tuple, Union

import numpy as np
from huggingface_hub import hf_hub_download

from paddlenlp import __version__

from ..utils.downloader import COMMUNITY_MODEL_PREFIX, get_path_from_url_with_filelock
from ..utils.env import MODEL_HOME
from ..utils.env import HF_CACHE_HOME, MODEL_HOME
from ..utils.log import logger
from .feature_extraction_utils import BatchFeature as BaseBatchFeature

Expand Down Expand Up @@ -186,11 +189,16 @@ def get_image_processor_dict(
Parameters:
pretrained_model_name_or_path (`str` or `os.PathLike`):
The identifier of the pre-trained checkpoint from which we want the dictionary of parameters.
from_hf_hub (bool, optional): Whether to load from HuggingFace Hub.
subfolder (str, optional): An optional value corresponding to a folder inside the repo.


Returns:
`Tuple[Dict, Dict]`: The dictionary(ies) that will be used to instantiate the image processor object.
"""
cache_dir = kwargs.pop("cache_dir", None)
from_hf_hub = kwargs.pop("from_hf_hub", False)
subfolder = kwargs.pop("subfolder", None)

pretrained_model_name_or_path = str(pretrained_model_name_or_path)
is_local = os.path.isdir(pretrained_model_name_or_path)
Expand All @@ -199,6 +207,16 @@ def get_image_processor_dict(
elif os.path.isfile(pretrained_model_name_or_path):
resolved_image_processor_file = pretrained_model_name_or_path
is_local = True
elif from_hf_hub:
image_processor_file = IMAGE_PROCESSOR_NAME
resolved_image_processor_file = hf_hub_download(
repo_id=pretrained_model_name_or_path,
filename=image_processor_file,
cache_dir=HF_CACHE_HOME,
subfolder=subfolder,
library_name="PaddleNLP",
library_version=__version__,
)
else:
# Assuming from community-contributed pretrained models
image_processor_file = COMMUNITY_MODEL_PREFIX + pretrained_model_name_or_path + "/" + IMAGE_PROCESSOR_NAME
Expand Down
35 changes: 27 additions & 8 deletions paddlenlp/transformers/model_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -146,30 +146,35 @@ def _find_weight_file_path(
)


def resolve_weight_file_from_hf_hub(repo_id: str, cache_dir: str, support_conversion: bool):
def resolve_weight_file_from_hf_hub(repo_id: str, cache_dir: str, support_conversion: bool, subfolder=None):
"""find the suitable weight file name

Args:
repo_id (str): repo name of huggingface hub
cache_dir (str): cache dir for hf
support_conversion (bool): whether support converting pytorch weight file to paddle weight file
subfolder (str, optional): An optional value corresponding to a folder inside the repo.
"""
if hf_file_exists(repo_id, "model_state.pdparams"):
if hf_file_exists(repo_id, "model_state.pdparams", subfolder=subfolder):
file_name = "model_state.pdparams"
elif hf_file_exists(repo_id, PYTORCH_WEIGHT_FILE_NAME):
elif hf_file_exists(repo_id, PYTORCH_WEIGHT_FILE_NAME, subfolder=subfolder):
if not support_conversion:
raise EntryNotFoundError(
f"can not download `model_state.pdparams from https://huggingface.co/{repo_id}` "
"and current model doesn't support conversion from pytorch weight file to paddle weight file"
)
file_name = PYTORCH_WEIGHT_FILE_NAME
else:
raise EntryNotFoundError(f"can not find the paddle/pytorch weight file from: https://huggingface.co/{repo_id}")
raise EntryNotFoundError(
message=f"can not find the paddle/pytorch weight file from: https://huggingface.co/{repo_id}",
response=None,
)

return hf_hub_download(
repo_id=repo_id,
filename=file_name,
cache_dir=cache_dir,
subfolder=subfolder,
library_name="PaddleNLP",
library_version=__version__,
)
Expand Down Expand Up @@ -425,7 +430,7 @@ def constructed_from_pretrained_config(cls, init_func=None) -> bool:
return cls.config_class is not None and issubclass(cls.config_class, PretrainedConfig)

@classmethod
def from_pretrained(cls, pretrained_model_name_or_path, *args, from_hf_hub=False, **kwargs):
def from_pretrained(cls, pretrained_model_name_or_path, *args, from_hf_hub=False, subfolder=None, **kwargs):
"""
Creates an instance of `PretrainedModel`. Model weights are loaded
by specifying name of a built-in pretrained model, or a community contributed model,
Expand All @@ -441,6 +446,8 @@ def from_pretrained(cls, pretrained_model_name_or_path, *args, from_hf_hub=False
- Local directory path which contains model weights file("model_state.pdparams")
and model config file ("model_config.json").
from_hf_hub (bool): whether to load from Huggingface Hub
subfolder (str, optional): An optional value corresponding to a folder inside the repo.
Only works when loading from HuggingFace Hub.
*args (tuple): Position arguments for model `__init__`. If provided,
use these as position argument values for model initialization.
**kwargs (dict): Keyword arguments for model `__init__`. If provided,
Expand Down Expand Up @@ -475,7 +482,9 @@ def from_pretrained(cls, pretrained_model_name_or_path, *args, from_hf_hub=False
model = BertForSequenceClassification.from_pretrained('./my_bert/')
"""
if cls.constructed_from_pretrained_config():
return cls.from_pretrained_v2(pretrained_model_name_or_path, from_hf_hub=from_hf_hub, *args, **kwargs)
return cls.from_pretrained_v2(
pretrained_model_name_or_path, from_hf_hub=from_hf_hub, subfolder=subfolder, *args, **kwargs
)

resource_files = {}
init_configuration = {}
Expand Down Expand Up @@ -523,6 +532,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, *args, from_hf_hub=False
resolved_resource_files[file_id] = hf_hub_download(
repo_id=pretrained_model_name_or_path,
filename=file_path,
subfolder=subfolder,
cache_dir=HF_CACHE_HOME,
library_name="PaddleNLP",
library_version=__version__,
Expand Down Expand Up @@ -951,6 +961,7 @@ def _resolve_model_file_path(
pretrained_model_name_or_path: str,
from_hf_hub: bool = False,
cache_dir: str | None = None,
subfolder: str | None = None,
support_conversion: bool = False,
) -> str:
"""resolve model target file path from `` and `cache_dir`
Expand Down Expand Up @@ -1033,7 +1044,10 @@ def _resolve_model_file_path(
# 4. when it's from HF
if from_hf_hub:
return resolve_weight_file_from_hf_hub(
pretrained_model_name_or_path, cache_dir=HF_CACHE_HOME, support_conversion=support_conversion
pretrained_model_name_or_path,
cache_dir=HF_CACHE_HOME,
support_conversion=support_conversion,
subfolder=subfolder,
)

# 5. download from community or hf-hub
Expand Down Expand Up @@ -1225,7 +1239,9 @@ def _find_mismatched_keys(
return model_to_load, missing_keys, unexpected_keys, mismatched_keys

@classmethod
def from_pretrained_v2(cls, pretrained_model_name_or_path, from_hf_hub: bool = False, *args, **kwargs):
def from_pretrained_v2(
cls, pretrained_model_name_or_path, from_hf_hub: bool = False, subfolder: str | None = None, *args, **kwargs
):
"""
Creates an instance of `PretrainedModel`. Model weights are loaded
by specifying name of a built-in pretrained model, a pretrained model from HF Hub, a community contributed model,
Expand All @@ -1241,6 +1257,8 @@ def from_pretrained_v2(cls, pretrained_model_name_or_path, from_hf_hub: bool = F
- Local directory path which contains model weights file("model_state.pdparams")
and model config file ("model_config.json").
from_hf_hub (bool): load model from huggingface hub. Default to `False`.
subfolder (str, optional): An optional value corresponding to a folder inside the repo.
Only works when loading from HuggingFace Hub.
*args (tuple): Position arguments for model `__init__`. If provided,
use these as position argument values for model initialization.
**kwargs (dict): Keyword arguments for model `__init__`. If provided,
Expand Down Expand Up @@ -1308,6 +1326,7 @@ def from_pretrained_v2(cls, pretrained_model_name_or_path, from_hf_hub: bool = F
model_weight_file = cls._resolve_model_file_path(
pretrained_model_name_or_path,
cache_dir=cache_dir,
subfolder=subfolder,
from_hf_hub=from_hf_hub,
support_conversion=support_conversion,
)
Expand Down
6 changes: 5 additions & 1 deletion paddlenlp/transformers/tokenizer_utils_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -1399,7 +1399,7 @@ def get_vocab(self) -> Dict[str, int]:
raise NotImplementedError()

@classmethod
def from_pretrained(cls, pretrained_model_name_or_path, *args, from_hf_hub=False, **kwargs):
def from_pretrained(cls, pretrained_model_name_or_path, *args, from_hf_hub=False, subfolder=None, **kwargs):
"""
Creates an instance of `PretrainedTokenizer`. Related resources are loaded
by specifying name of a built-in pretrained model, or a community-contributed
Expand All @@ -1413,6 +1413,9 @@ def from_pretrained(cls, pretrained_model_name_or_path, *args, from_hf_hub=False
- Name of a community-contributed pretrained model.
- Local directory path which contains tokenizer related resources
and tokenizer config file ("tokenizer_config.json").
from_hf_hub (bool, optional): whether to load from Huggingface Hub
subfolder (str, optional): An optional value corresponding to a folder inside the repo.
Only works when loading from HuggingFace Hub.
*args (tuple): position arguments for model `__init__`. If provided,
use these as position argument values for tokenizer initialization.
**kwargs (dict): keyword arguments for model `__init__`. If provided,
Expand Down Expand Up @@ -1486,6 +1489,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, *args, from_hf_hub=False
resolved_vocab_files[file_id] = hf_hub_download(
repo_id=pretrained_model_name_or_path,
filename=file_path,
subfolder=subfolder,
cache_dir=HF_CACHE_HOME,
library_name="PaddleNLP",
library_version=__version__,
Expand Down
7 changes: 5 additions & 2 deletions paddlenlp/utils/downloader.py
Original file line number Diff line number Diff line change
Expand Up @@ -467,7 +467,9 @@ def url_file_exists(url: str) -> bool:
return result.status_code == requests.codes.ok


def hf_file_exists(repo_id: str, filename: str, token: Union[bool, str, None] = None) -> bool:
def hf_file_exists(
repo_id: str, filename: str, token: Union[bool, str, None] = None, subfolder: Optional[str] = None
) -> bool:
"""Check whether the HF file exists

Args:
Expand All @@ -477,11 +479,12 @@ def hf_file_exists(repo_id: str, filename: str, token: Union[bool, str, None] =
- If `True`, the token is read from the HuggingFace config folder.
- If `False` or `None`, no token is provided.
- If a string, it's used as the authentication token.
subfolder (str, optional): An optional value corresponding to a folder inside the repo.
Returns:
bool: whether the HF file exists
"""

url = hf_hub_url(repo_id, filename)
url = hf_hub_url(repo_id=repo_id, filename=filename, subfolder=subfolder)
try:
_ = get_hf_file_metadata(
url=url,
Expand Down
Loading