Skip to content
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion paddlenlp/transformers/auto/modeling.py
Original file line number Diff line number Diff line change
Expand Up @@ -233,7 +233,9 @@ def _get_model_class_from_config(cls, pretrained_model_name_or_path, config_file
)

@classmethod
def _from_pretrained(cls, pretrained_model_name_or_path, task=None, from_hf_hub=False, *model_args, **kwargs):
def _from_pretrained(
cls, pretrained_model_name_or_path, task=None, from_hf_hub=False, subfolder=None, *model_args, **kwargs
):
if task:
if cls._task_choice:
cls._name_mapping = get_name_mapping(task)
Expand All @@ -251,6 +253,7 @@ def _from_pretrained(cls, pretrained_model_name_or_path, task=None, from_hf_hub=
config_file = hf_hub_download(
repo_id=pretrained_model_name_or_path,
filename=cls.model_config_file,
subfolder=subfolder,
cache_dir=HF_CACHE_HOME,
library_name="PaddleNLP",
library_version=__version__,
Expand All @@ -260,6 +263,7 @@ def _from_pretrained(cls, pretrained_model_name_or_path, task=None, from_hf_hub=
config_file = hf_hub_download(
repo_id=pretrained_model_name_or_path,
filename=cls.legacy_model_config_file,
subfolder=subfolder,
cache_dir=HF_CACHE_HOME,
library_name="PaddleNLP",
library_version=__version__,
Expand Down
6 changes: 5 additions & 1 deletion paddlenlp/transformers/auto/tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -205,7 +205,7 @@ def _get_tokenizer_class_from_config(cls, pretrained_model_name_or_path, config_
return tokenizer_class

@classmethod
def from_pretrained(cls, pretrained_model_name_or_path, from_hf_hub=False, *model_args, **kwargs):
def from_pretrained(cls, pretrained_model_name_or_path, from_hf_hub=False, subfolder=None, *model_args, **kwargs):
"""
Creates an instance of `AutoTokenizer`. Related resources are loaded by
specifying name of a built-in pretrained model, or a community-contributed
Expand All @@ -219,6 +219,9 @@ def from_pretrained(cls, pretrained_model_name_or_path, from_hf_hub=False, *mode
- Name of a community-contributed pretrained model.
- Local directory path which contains tokenizer related resources
and tokenizer config file ("tokenizer_config.json").
from_hf_hub (bool, optional): Whether to load from HuggingFace Hub.
subfolder (str, optional): An optional value corresponding to a folder inside the repo.
Only works when loading from HuggingFace Hub.
*args (tuple): position arguments for model `__init__`. If provided,
use these as position argument values for tokenizer initialization.
**kwargs (dict): keyword arguments for model `__init__`. If provided,
Expand Down Expand Up @@ -263,6 +266,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, from_hf_hub=False, *mode
config_file = hf_hub_download(
repo_id=pretrained_model_name_or_path,
filename=cls.tokenizer_config_file,
subfolder=subfolder,
cache_dir=HF_CACHE_HOME,
library_name="PaddleNLP",
library_version=__version__,
Expand Down
4 changes: 3 additions & 1 deletion paddlenlp/transformers/clip/modeling.py
Original file line number Diff line number Diff line change
Expand Up @@ -498,6 +498,7 @@ def from_pretrained_v2(cls, pretrained_model_name_or_path, from_hf_hub: bool = F
ignore_mismatched_sizes = kwargs.pop("ignore_mismatched_sizes", None)
dtype = kwargs.pop("dtype", None)
cache_dir = kwargs.pop("cache_dir", None)
subfolder = kwargs.pop("subfolder", None)

cache_dir = resolve_cache_dir(pretrained_model_name_or_path, from_hf_hub, cache_dir)

Expand All @@ -511,6 +512,7 @@ def from_pretrained_v2(cls, pretrained_model_name_or_path, from_hf_hub: bool = F
return_unused_kwargs=True,
force_download=force_download,
from_hf_hub=from_hf_hub,
subfolder=subfolder,
**kwargs,
)
# Attention! we do not save this config.json
Expand All @@ -522,7 +524,7 @@ def from_pretrained_v2(cls, pretrained_model_name_or_path, from_hf_hub: bool = F

# 3. resolve model_weight file
model_weight_file = cls._resolve_model_file_path(
pretrained_model_name_or_path, cache_dir=cache_dir, from_hf_hub=from_hf_hub
pretrained_model_name_or_path, cache_dir=cache_dir, from_hf_hub=from_hf_hub, subfolder=subfolder
)

# 4. loading the state dict
Expand Down
12 changes: 8 additions & 4 deletions paddlenlp/transformers/configuration_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -248,17 +248,18 @@ def is_standard_config(config: Union[PretrainedConfig, Dict[str, Any]]) -> bool:
return "init_class" not in config and "architectures" in config


def resolve_hf_config_path(repo_id: str, cache_dir: str) -> str:
def resolve_hf_config_path(repo_id: str, cache_dir: str, subfolder=None) -> str:
"""resolve config file from hf hub

Args:
repo_id (str): the repo name from huggingface hub
cache_dir (str): the cachedir
subfolder (str, optional): An optional value corresponding to a folder inside the repo.

Returns:
str: the downloaded config file
"""
if hf_file_exists(repo_id, CONFIG_NAME):
if hf_file_exists(repo_id=repo_id, filename=CONFIG_NAME, subfolder=subfolder):
file_name = CONFIG_NAME
else:
raise EntryNotFoundError(f"can not find the paddle/pytorch config file from: https://huggingface.co/{repo_id}")
Expand All @@ -267,6 +268,7 @@ def resolve_hf_config_path(repo_id: str, cache_dir: str) -> str:
repo_id=repo_id,
filename=file_name,
cache_dir=cache_dir,
subfolder=subfolder,
library_name="PaddleNLP",
library_version=__version__,
)
Expand Down Expand Up @@ -770,6 +772,7 @@ def _get_config_dict(
) -> Tuple[Dict[str, Any], Dict[str, Any]]:
cache_dir = kwargs.pop("cache_dir", None)
from_hf_hub = kwargs.pop("from_hf_hub", False)
subfolder = kwargs.pop("subfolder", None)
cache_dir = resolve_cache_dir(pretrained_model_name_or_path, from_hf_hub, cache_dir)

force_download = kwargs.pop("force_download", False)
Expand Down Expand Up @@ -813,8 +816,9 @@ def _get_config_dict(

# 4. get the configuration file from HF hub
elif from_hf_hub:
resolved_config_file = resolve_hf_config_path(repo_id=pretrained_model_name_or_path, cache_dir=cache_dir)

resolved_config_file = resolve_hf_config_path(
repo_id=pretrained_model_name_or_path, cache_dir=cache_dir, subfolder=subfolder
)
else:
community_url = os.path.join(COMMUNITY_MODEL_PREFIX, pretrained_model_name_or_path, CONFIG_NAME)
if url_file_exists(community_url):
Expand Down
20 changes: 19 additions & 1 deletion paddlenlp/transformers/image_processing_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,12 @@
from typing import Any, Dict, Iterable, Optional, Tuple, Union

import numpy as np
from huggingface_hub import hf_hub_download

from paddlenlp import __version__

from ..utils.downloader import COMMUNITY_MODEL_PREFIX, get_path_from_url_with_filelock
from ..utils.env import MODEL_HOME
from ..utils.env import HF_CACHE_HOME, MODEL_HOME
from ..utils.log import logger
from .feature_extraction_utils import BatchFeature as BaseBatchFeature

Expand Down Expand Up @@ -186,11 +189,16 @@ def get_image_processor_dict(
Parameters:
pretrained_model_name_or_path (`str` or `os.PathLike`):
The identifier of the pre-trained checkpoint from which we want the dictionary of parameters.
from_hf_hub (bool, optional): Whether to load from HuggingFace Hub.
subfolder (str, optional): An optional value corresponding to a folder inside the repo.


Returns:
`Tuple[Dict, Dict]`: The dictionary(ies) that will be used to instantiate the image processor object.
"""
cache_dir = kwargs.pop("cache_dir", None)
from_hf_hub = kwargs.pop("from_hf_hub", False)
subfolder = kwargs.pop("subfolder", None)

pretrained_model_name_or_path = str(pretrained_model_name_or_path)
is_local = os.path.isdir(pretrained_model_name_or_path)
Expand All @@ -199,6 +207,16 @@ def get_image_processor_dict(
elif os.path.isfile(pretrained_model_name_or_path):
resolved_image_processor_file = pretrained_model_name_or_path
is_local = True
elif from_hf_hub:
image_processor_file = IMAGE_PROCESSOR_NAME
resolved_image_processor_file = hf_hub_download(
repo_id=pretrained_model_name_or_path,
filename=image_processor_file,
cache_dir=HF_CACHE_HOME,
subfolder=subfolder,
library_name="PaddleNLP",
library_version=__version__,
)
else:
# Assuming from community-contributed pretrained models
image_processor_file = COMMUNITY_MODEL_PREFIX + pretrained_model_name_or_path + "/" + IMAGE_PROCESSOR_NAME
Expand Down
35 changes: 27 additions & 8 deletions paddlenlp/transformers/model_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -146,30 +146,35 @@ def _find_weight_file_path(
)


def resolve_weight_file_from_hf_hub(repo_id: str, cache_dir: str, support_conversion: bool):
def resolve_weight_file_from_hf_hub(repo_id: str, cache_dir: str, support_conversion: bool, subfolder=None):
"""find the suitable weight file name

Args:
repo_id (str): repo name of huggingface hub
cache_dir (str): cache dir for hf
support_conversion (bool): whether support converting pytorch weight file to paddle weight file
subfolder (str, optional): An optional value corresponding to a folder inside the repo.
"""
if hf_file_exists(repo_id, "model_state.pdparams"):
if hf_file_exists(repo_id, "model_state.pdparams", subfolder=subfolder):
file_name = "model_state.pdparams"
elif hf_file_exists(repo_id, PYTORCH_WEIGHT_FILE_NAME):
elif hf_file_exists(repo_id, PYTORCH_WEIGHT_FILE_NAME, subfolder=subfolder):
if not support_conversion:
raise EntryNotFoundError(
f"can not download `model_state.pdparams from https://huggingface.co/{repo_id}` "
"and current model doesn't support conversion from pytorch weight file to paddle weight file"
)
file_name = PYTORCH_WEIGHT_FILE_NAME
else:
raise EntryNotFoundError(f"can not find the paddle/pytorch weight file from: https://huggingface.co/{repo_id}")
raise EntryNotFoundError(
message=f"can not find the paddle/pytorch weight file from: https://huggingface.co/{repo_id}",
response=None,
)

return hf_hub_download(
repo_id=repo_id,
filename=file_name,
cache_dir=cache_dir,
subfolder=subfolder,
library_name="PaddleNLP",
library_version=__version__,
)
Expand Down Expand Up @@ -425,7 +430,7 @@ def constructed_from_pretrained_config(cls, init_func=None) -> bool:
return cls.config_class is not None and issubclass(cls.config_class, PretrainedConfig)

@classmethod
def from_pretrained(cls, pretrained_model_name_or_path, *args, from_hf_hub=False, **kwargs):
def from_pretrained(cls, pretrained_model_name_or_path, *args, from_hf_hub=False, subfolder=None, **kwargs):
"""
Creates an instance of `PretrainedModel`. Model weights are loaded
by specifying name of a built-in pretrained model, or a community contributed model,
Expand All @@ -441,6 +446,8 @@ def from_pretrained(cls, pretrained_model_name_or_path, *args, from_hf_hub=False
- Local directory path which contains model weights file("model_state.pdparams")
and model config file ("model_config.json").
from_hf_hub (bool): whether to load from Huggingface Hub
subfolder (str, optional): An optional value corresponding to a folder inside the repo.
Only works when loading from HuggingFace Hub.
*args (tuple): Position arguments for model `__init__`. If provided,
use these as position argument values for model initialization.
**kwargs (dict): Keyword arguments for model `__init__`. If provided,
Expand Down Expand Up @@ -475,7 +482,9 @@ def from_pretrained(cls, pretrained_model_name_or_path, *args, from_hf_hub=False
model = BertForSequenceClassification.from_pretrained('./my_bert/')
"""
if cls.constructed_from_pretrained_config():
return cls.from_pretrained_v2(pretrained_model_name_or_path, from_hf_hub=from_hf_hub, *args, **kwargs)
return cls.from_pretrained_v2(
pretrained_model_name_or_path, from_hf_hub=from_hf_hub, subfolder=subfolder, *args, **kwargs
)

resource_files = {}
init_configuration = {}
Expand Down Expand Up @@ -523,6 +532,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, *args, from_hf_hub=False
resolved_resource_files[file_id] = hf_hub_download(
repo_id=pretrained_model_name_or_path,
filename=file_path,
subfolder=subfolder,
cache_dir=HF_CACHE_HOME,
library_name="PaddleNLP",
library_version=__version__,
Expand Down Expand Up @@ -951,6 +961,7 @@ def _resolve_model_file_path(
pretrained_model_name_or_path: str,
from_hf_hub: bool = False,
cache_dir: str | None = None,
subfolder: str | None = None,
support_conversion: bool = False,
) -> str:
"""resolve model target file path from `` and `cache_dir`
Expand Down Expand Up @@ -1033,7 +1044,10 @@ def _resolve_model_file_path(
# 4. when it's from HF
if from_hf_hub:
return resolve_weight_file_from_hf_hub(
pretrained_model_name_or_path, cache_dir=HF_CACHE_HOME, support_conversion=support_conversion
pretrained_model_name_or_path,
cache_dir=HF_CACHE_HOME,
support_conversion=support_conversion,
subfolder=subfolder,
)

# 5. download from community or hf-hub
Expand Down Expand Up @@ -1225,7 +1239,9 @@ def _find_mismatched_keys(
return model_to_load, missing_keys, unexpected_keys, mismatched_keys

@classmethod
def from_pretrained_v2(cls, pretrained_model_name_or_path, from_hf_hub: bool = False, *args, **kwargs):
def from_pretrained_v2(
cls, pretrained_model_name_or_path, from_hf_hub: bool = False, subfolder: str | None = None, *args, **kwargs
):
"""
Creates an instance of `PretrainedModel`. Model weights are loaded
by specifying name of a built-in pretrained model, a pretrained model from HF Hub, a community contributed model,
Expand All @@ -1241,6 +1257,8 @@ def from_pretrained_v2(cls, pretrained_model_name_or_path, from_hf_hub: bool = F
- Local directory path which contains model weights file("model_state.pdparams")
and model config file ("model_config.json").
from_hf_hub (bool): load model from huggingface hub. Default to `False`.
subfolder (str, optional): An optional value corresponding to a folder inside the repo.
Only works when loading from HuggingFace Hub.
*args (tuple): Position arguments for model `__init__`. If provided,
use these as position argument values for model initialization.
**kwargs (dict): Keyword arguments for model `__init__`. If provided,
Expand Down Expand Up @@ -1308,6 +1326,7 @@ def from_pretrained_v2(cls, pretrained_model_name_or_path, from_hf_hub: bool = F
model_weight_file = cls._resolve_model_file_path(
pretrained_model_name_or_path,
cache_dir=cache_dir,
subfolder=subfolder,
from_hf_hub=from_hf_hub,
support_conversion=support_conversion,
)
Expand Down
6 changes: 5 additions & 1 deletion paddlenlp/transformers/tokenizer_utils_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -1399,7 +1399,7 @@ def get_vocab(self) -> Dict[str, int]:
raise NotImplementedError()

@classmethod
def from_pretrained(cls, pretrained_model_name_or_path, *args, from_hf_hub=False, **kwargs):
def from_pretrained(cls, pretrained_model_name_or_path, *args, from_hf_hub=False, subfolder=None, **kwargs):
"""
Creates an instance of `PretrainedTokenizer`. Related resources are loaded
by specifying name of a built-in pretrained model, or a community-contributed
Expand All @@ -1413,6 +1413,9 @@ def from_pretrained(cls, pretrained_model_name_or_path, *args, from_hf_hub=False
- Name of a community-contributed pretrained model.
- Local directory path which contains tokenizer related resources
and tokenizer config file ("tokenizer_config.json").
from_hf_hub (bool, optional): whether to load from Huggingface Hub
subfolder (str, optional): An optional value corresponding to a folder inside the repo.
Only works when loading from HuggingFace Hub.
*args (tuple): position arguments for model `__init__`. If provided,
use these as position argument values for tokenizer initialization.
**kwargs (dict): keyword arguments for model `__init__`. If provided,
Expand Down Expand Up @@ -1486,6 +1489,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, *args, from_hf_hub=False
resolved_vocab_files[file_id] = hf_hub_download(
repo_id=pretrained_model_name_or_path,
filename=file_path,
subfolder=subfolder,
cache_dir=HF_CACHE_HOME,
library_name="PaddleNLP",
library_version=__version__,
Expand Down
7 changes: 5 additions & 2 deletions paddlenlp/utils/downloader.py
Original file line number Diff line number Diff line change
Expand Up @@ -467,7 +467,9 @@ def url_file_exists(url: str) -> bool:
return result.status_code == requests.codes.ok


def hf_file_exists(repo_id: str, filename: str, token: Union[bool, str, None] = None) -> bool:
def hf_file_exists(
repo_id: str, filename: str, token: Union[bool, str, None] = None, subfolder: Optional[str] = None
) -> bool:
"""Check whether the HF file exists

Args:
Expand All @@ -477,11 +479,12 @@ def hf_file_exists(repo_id: str, filename: str, token: Union[bool, str, None] =
- If `True`, the token is read from the HuggingFace config folder.
- If `False` or `None`, no token is provided.
- If a string, it's used as the authentication token.
subfolder (str, optional): An optional value corresponding to a folder inside the repo.
Returns:
bool: whether the HF file exists
"""

url = hf_hub_url(repo_id, filename)
url = hf_hub_url(repo_id=repo_id, filename=filename, subfolder=subfolder)
try:
_ = get_hf_file_metadata(
url=url,
Expand Down
Loading