Commit 01db72a

Rewrite push_to_hub to use upload_files (#18366)
* Rewrite push_to_hub to use upload_files
* Adapt the doc a bit
* Address review comments and clean doc
1 parent 3909d7f commit 01db72a

18 files changed: +552 additions, -524 deletions
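At a high level, the commit replaces the clone-and-synchronize workflow (a local git checkout driven by `repo_path_or_name`/`organization` arguments) with direct uploads of only the modified files, keyed by a single `repo_id`. A minimal before/after sketch of the calling convention (the `model` object is a stand-in; the exact snippets appear in the doc diffs below):

```python
# Before: the target repo and organization were separate arguments, and
# push_to_hub synchronized a local git clone of the repo.
model.push_to_hub(
    repo_path_or_name="my-finetuned-bert",
    # organization="my-org",
    use_temp_dir=True,
)

# After: a single repo_id names both the namespace and the repo, and only
# modified files are uploaded (no local git clone).
model.push_to_hub("my-finetuned-bert")         # your namespace
model.push_to_hub("my-org/my-finetuned-bert")  # an organization
```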

docs/source/en/add_new_model.mdx

Lines changed: 3 additions & 7 deletions
@@ -813,13 +813,9 @@ checkpoint and to get the required access rights to be able to upload the model
 *brand_new_bert*. The `push_to_hub` method, present in all models in `transformers`, is a quick and efficient way to push your checkpoint to the hub. A little snippet is pasted below:
 
 ```python
-brand_new_bert.push_to_hub(
-    repo_path_or_name="brand_new_bert",
-    # Uncomment the following line to push to an organization
-    # organization="<ORGANIZATION>",
-    commit_message="Add model",
-    use_temp_dir=True,
-)
+brand_new_bert.push_to_hub("brand_new_bert")
+# Uncomment the following line to push to an organization.
+# brand_new_bert.push_to_hub("<organization>/brand_new_bert")
 ```
 
 It is worth spending some time to create fitting model cards for each checkpoint. The model cards should highlight the

docs/source/en/model_sharing.mdx

Lines changed: 2 additions & 2 deletions
@@ -179,10 +179,10 @@ This creates a repository under your username with the model name `my-awesome-mo
 >>> model = AutoModel.from_pretrained("your_username/my-awesome-model")
 ```
 
-If you belong to an organization and want to push your model under the organization name instead, add the `organization` parameter:
+If you belong to an organization and want to push your model under the organization name instead, just add it to the `repo_id`:
 
 ```py
->>> pt_model.push_to_hub("my-awesome-model", organization="my-awesome-org")
+>>> pt_model.push_to_hub("my-awesome-org/my-awesome-model")
 ```
 
 The `push_to_hub` function can also be used to add other files to a model repository. For example, add a tokenizer to a model repository:
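The same one-argument convention extends to tokenizers and any other object exposing `push_to_hub`, so related files can land in the same repository. An illustrative snippet in the doc's doctest style (not part of the diff):

```py
>>> from transformers import AutoTokenizer

>>> tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
>>> tokenizer.push_to_hub("my-awesome-org/my-awesome-model")
```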

src/transformers/configuration_utils.py

Lines changed: 11 additions & 15 deletions
@@ -417,27 +417,22 @@ def save_pretrained(self, save_directory: Union[str, os.PathLike], push_to_hub:
             save_directory (`str` or `os.PathLike`):
                 Directory where the configuration JSON file will be saved (will be created if it does not exist).
             push_to_hub (`bool`, *optional*, defaults to `False`):
-                Whether or not to push your model to the Hugging Face model hub after saving it.
-
-                <Tip warning={true}>
-
-                Using `push_to_hub=True` will synchronize the repository you are pushing to with `save_directory`,
-                which requires `save_directory` to be a local clone of the repo you are pushing to if it's an existing
-                folder. Pass along `temp_dir=True` to use a temporary directory instead.
-
-                </Tip>
-
+                Whether or not to push your model to the Hugging Face model hub after saving it. You can specify the
+                repository you want to push to with `repo_id` (will default to the name of `save_directory` in your
+                namespace).
             kwargs:
                 Additional key word arguments passed along to the [`~utils.PushToHubMixin.push_to_hub`] method.
         """
         if os.path.isfile(save_directory):
             raise AssertionError(f"Provided path ({save_directory}) should be a directory, not a file")
 
+        os.makedirs(save_directory, exist_ok=True)
+
         if push_to_hub:
             commit_message = kwargs.pop("commit_message", None)
-            repo = self._create_or_get_repo(save_directory, **kwargs)
-
-            os.makedirs(save_directory, exist_ok=True)
+            repo_id = kwargs.pop("repo_id", save_directory.split(os.path.sep)[-1])
+            repo_id, token = self._create_repo(repo_id, **kwargs)
+            files_timestamps = self._get_files_timestamps(save_directory)
 
         # If we have a custom config, we copy the file defining it in the folder and set the attributes so it can be
         # loaded from the Hub.
@@ -451,8 +446,9 @@ def save_pretrained(self, save_directory: Union[str, os.PathLike], push_to_hub:
         logger.info(f"Configuration saved in {output_config_file}")
 
         if push_to_hub:
-            url = self._push_to_hub(repo, commit_message=commit_message)
-            logger.info(f"Configuration pushed to the hub in this commit: {url}")
+            self._upload_modified_files(
+                save_directory, repo_id, files_timestamps, commit_message=commit_message, token=token
+            )
 
     @classmethod
     def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
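Every rewritten `save_pretrained` now follows the same shape: snapshot file timestamps, write the files, then upload only what changed via `_upload_modified_files`. The toy helpers below illustrate just the timestamp-diff idea; they are a self-contained sketch, not the actual `PushToHubMixin` internals (which this diff does not show):

```python
import os


def get_files_timestamps(directory):
    """Snapshot the modification time of every file currently in `directory`."""
    return {f: os.path.getmtime(os.path.join(directory, f)) for f in os.listdir(directory)}


def modified_files(directory, snapshot):
    """Files that are new or have changed since `snapshot` was taken."""
    return [
        f
        for f in os.listdir(directory)
        if f not in snapshot or os.path.getmtime(os.path.join(directory, f)) > snapshot[f]
    ]


# Mirrors the flow above: snapshot, save, then upload only the diff.
os.makedirs("my-model", exist_ok=True)
before = get_files_timestamps("my-model")
with open(os.path.join("my-model", "config.json"), "w") as fp:
    fp.write("{}")
print(modified_files("my-model", before))  # ['config.json']
```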

src/transformers/feature_extraction_utils.py

Lines changed: 11 additions & 14 deletions
@@ -318,41 +318,38 @@ def save_pretrained(self, save_directory: Union[str, os.PathLike], push_to_hub:
             save_directory (`str` or `os.PathLike`):
                 Directory where the feature extractor JSON file will be saved (will be created if it does not exist).
             push_to_hub (`bool`, *optional*, defaults to `False`):
-                Whether or not to push your feature extractor to the Hugging Face model hub after saving it.
-
-                <Tip warning={true}>
-
-                Using `push_to_hub=True` will synchronize the repository you are pushing to with `save_directory`,
-                which requires `save_directory` to be a local clone of the repo you are pushing to if it's an existing
-                folder. Pass along `temp_dir=True` to use a temporary directory instead.
-
-                </Tip>
-
+                Whether or not to push your model to the Hugging Face model hub after saving it. You can specify the
+                repository you want to push to with `repo_id` (will default to the name of `save_directory` in your
+                namespace).
             kwargs:
                 Additional key word arguments passed along to the [`~utils.PushToHubMixin.push_to_hub`] method.
         """
         if os.path.isfile(save_directory):
             raise AssertionError(f"Provided path ({save_directory}) should be a directory, not a file")
 
+        os.makedirs(save_directory, exist_ok=True)
+
         if push_to_hub:
             commit_message = kwargs.pop("commit_message", None)
-            repo = self._create_or_get_repo(save_directory, **kwargs)
+            repo_id = kwargs.pop("repo_id", save_directory.split(os.path.sep)[-1])
+            repo_id, token = self._create_repo(repo_id, **kwargs)
+            files_timestamps = self._get_files_timestamps(save_directory)
 
         # If we have a custom config, we copy the file defining it in the folder and set the attributes so it can be
         # loaded from the Hub.
         if self._auto_class is not None:
             custom_object_save(self, save_directory, config=self)
 
-        os.makedirs(save_directory, exist_ok=True)
         # If we save using the predefined names, we can load using `from_pretrained`
         output_feature_extractor_file = os.path.join(save_directory, FEATURE_EXTRACTOR_NAME)
 
         self.to_json_file(output_feature_extractor_file)
         logger.info(f"Feature extractor saved in {output_feature_extractor_file}")
 
         if push_to_hub:
-            url = self._push_to_hub(repo, commit_message=commit_message)
-            logger.info(f"Feature extractor pushed to the hub in this commit: {url}")
+            self._upload_modified_files(
+                save_directory, repo_id, files_timestamps, commit_message=commit_message, token=token
+            )
 
         return [output_feature_extractor_file]

src/transformers/modeling_flax_utils.py

Lines changed: 11 additions & 15 deletions
@@ -941,16 +941,9 @@ def save_pretrained(
             save_directory (`str` or `os.PathLike`):
                 Directory to which to save. Will be created if it doesn't exist.
             push_to_hub (`bool`, *optional*, defaults to `False`):
-                Whether or not to push your model to the Hugging Face model hub after saving it.
-
-                <Tip warning={true}>
-
-                Using `push_to_hub=True` will synchronize the repository you are pushing to with `save_directory`,
-                which requires `save_directory` to be a local clone of the repo you are pushing to if it's an existing
-                folder. Pass along `temp_dir=True` to use a temporary directory instead.
-
-                </Tip>
-
+                Whether or not to push your model to the Hugging Face model hub after saving it. You can specify the
+                repository you want to push to with `repo_id` (will default to the name of `save_directory` in your
+                namespace).
             max_shard_size (`int` or `str`, *optional*, defaults to `"10GB"`):
                 The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size
                 lower than this size. If expressed as a string, needs to be digits followed by a unit (like `"5MB"`).
@@ -969,11 +962,13 @@
             logger.error(f"Provided path ({save_directory}) should be a directory, not a file")
             return
 
+        os.makedirs(save_directory, exist_ok=True)
+
         if push_to_hub:
             commit_message = kwargs.pop("commit_message", None)
-            repo = self._create_or_get_repo(save_directory, **kwargs)
-
-            os.makedirs(save_directory, exist_ok=True)
+            repo_id = kwargs.pop("repo_id", save_directory.split(os.path.sep)[-1])
+            repo_id, token = self._create_repo(repo_id, **kwargs)
+            files_timestamps = self._get_files_timestamps(save_directory)
 
         # get abs dir
         save_directory = os.path.abspath(save_directory)
@@ -1028,8 +1023,9 @@
         logger.info(f"Model weights saved in {output_model_file}")
 
         if push_to_hub:
-            url = self._push_to_hub(repo, commit_message=commit_message)
-            logger.info(f"Model pushed to the hub in this commit: {url}")
+            self._upload_modified_files(
+                save_directory, repo_id, files_timestamps, commit_message=commit_message, token=token
+            )
 
     @classmethod
     def register_for_auto_class(cls, auto_class="FlaxAutoModel"):
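A small detail all of these rewrites share: when no `repo_id` is passed, the repo name defaults to the last path component of `save_directory`. A standalone illustration of that line:

```python
import os

# Mirrors `kwargs.pop("repo_id", save_directory.split(os.path.sep)[-1])`
save_directory = os.path.join("checkpoints", "brand_new_bert")
repo_id = save_directory.split(os.path.sep)[-1]
print(repo_id)  # brand_new_bert -> pushed as <username>/brand_new_bert
```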

src/transformers/modeling_tf_utils.py

Lines changed: 105 additions & 21 deletions
@@ -24,6 +24,7 @@
 import re
 import warnings
 from collections.abc import Mapping
+from pathlib import Path
 from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Union
 
 import h5py
@@ -58,14 +59,14 @@
     RepositoryNotFoundError,
     RevisionNotFoundError,
     cached_path,
-    copy_func,
     find_labels,
     has_file,
     hf_bucket_url,
     is_offline_mode,
     is_remote_url,
     logging,
     requires_backends,
+    working_or_temp_dir,
 )
 

@@ -1919,6 +1920,7 @@ def save_pretrained(
         version=1,
         push_to_hub=False,
         max_shard_size: Union[int, str] = "10GB",
+        create_pr: bool = False,
         **kwargs
     ):
         """
@@ -1935,16 +1937,9 @@
             TensorFlow Serving as detailed in the official documentation
             https://www.tensorflow.org/tfx/serving/serving_basic
             push_to_hub (`bool`, *optional*, defaults to `False`):
-                Whether or not to push your model to the Hugging Face model hub after saving it.
-
-                <Tip warning={true}>
-
-                Using `push_to_hub=True` will synchronize the repository you are pushing to with `save_directory`,
-                which requires `save_directory` to be a local clone of the repo you are pushing to if it's an existing
-                folder. Pass along `temp_dir=True` to use a temporary directory instead.
-
-                </Tip>
-
+                Whether or not to push your model to the Hugging Face model hub after saving it. You can specify the
+                repository you want to push to with `repo_id` (will default to the name of `save_directory` in your
+                namespace).
             max_shard_size (`int` or `str`, *optional*, defaults to `"10GB"`):
                 The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size
                 lower than this size. If expressed as a string, needs to be digits followed by a unit (like `"5MB"`).
@@ -1956,18 +1951,23 @@
 
                 </Tip>
 
+            create_pr (`bool`, *optional*, defaults to `False`):
+                Whether or not to create a PR with the uploaded files or directly commit.
+
             kwargs:
                 Additional key word arguments passed along to the [`~utils.PushToHubMixin.push_to_hub`] method.
         """
         if os.path.isfile(save_directory):
             logger.error(f"Provided path ({save_directory}) should be a directory, not a file")
             return
 
+        os.makedirs(save_directory, exist_ok=True)
+
         if push_to_hub:
             commit_message = kwargs.pop("commit_message", None)
-            repo = self._create_or_get_repo(save_directory, **kwargs)
-
-            os.makedirs(save_directory, exist_ok=True)
+            repo_id = kwargs.pop("repo_id", save_directory.split(os.path.sep)[-1])
+            repo_id, token = self._create_repo(repo_id, **kwargs)
+            files_timestamps = self._get_files_timestamps(save_directory)
 
         if saved_model:
             saved_model_dir = os.path.join(save_directory, "saved_model", str(version))
@@ -2030,8 +2030,9 @@
                     param_dset[:] = layer.numpy()
 
         if push_to_hub:
-            url = self._push_to_hub(repo, commit_message=commit_message)
-            logger.info(f"Model pushed to the hub in this commit: {url}")
+            self._upload_modified_files(
+                save_directory, repo_id, files_timestamps, commit_message=commit_message, token=token
+            )
 
     @classmethod
     def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
@@ -2475,12 +2476,95 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
 
         return model
 
+    def push_to_hub(
+        self,
+        repo_id: str,
+        use_temp_dir: Optional[bool] = None,
+        commit_message: Optional[str] = None,
+        private: Optional[bool] = None,
+        use_auth_token: Optional[Union[bool, str]] = None,
+        max_shard_size: Optional[Union[int, str]] = "10GB",
+        **model_card_kwargs
+    ) -> str:
+        """
+        Upload the model files to the 🤗 Model Hub while synchronizing a local clone of the repo in `repo_path_or_name`.
 
-# To update the docstring, we need to copy the method, otherwise we change the original docstring.
-TFPreTrainedModel.push_to_hub = copy_func(TFPreTrainedModel.push_to_hub)
-TFPreTrainedModel.push_to_hub.__doc__ = TFPreTrainedModel.push_to_hub.__doc__.format(
-    object="model", object_class="TFAutoModel", object_files="model checkpoint"
-)
+        Parameters:
+            repo_id (`str`):
+                The name of the repository you want to push your model to. It should contain your organization name
+                when pushing to a given organization.
+            use_temp_dir (`bool`, *optional*):
+                Whether or not to use a temporary directory to store the files saved before they are pushed to the Hub.
+                Will default to `True` if there is no directory named like `repo_id`, `False` otherwise.
+            commit_message (`str`, *optional*):
+                Message to commit while pushing. Will default to `"Upload model"`.
+            private (`bool`, *optional*):
+                Whether or not the repository created should be private (requires a paying subscription).
+            use_auth_token (`bool` or `str`, *optional*):
+                The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated
+                when running `transformers-cli login` (stored in `~/.huggingface`). Will default to `True` if
+                `repo_url` is not specified.
+            max_shard_size (`int` or `str`, *optional*, defaults to `"10GB"`):
+                Only applicable for models. The maximum size for a checkpoint before being sharded. Checkpoints shard
+                will then be each of size lower than this size. If expressed as a string, needs to be digits followed
+                by a unit (like `"5MB"`).
+            model_card_kwargs:
+                Additional keyword arguments passed along to the [`~TFPreTrainedModel.create_model_card`] method.
+
+        Examples:
+
+        ```python
+        from transformers import TFAutoModel
+
+        model = TFAutoModel.from_pretrained("bert-base-cased")
+
+        # Push the model to your namespace with the name "my-finetuned-bert".
+        model.push_to_hub("my-finetuned-bert")
+
+        # Push the model to an organization with the name "my-finetuned-bert".
+        model.push_to_hub("huggingface/my-finetuned-bert")
+        ```
+        """
+        if "repo_path_or_name" in model_card_kwargs:
+            warnings.warn(
+                "The `repo_path_or_name` argument is deprecated and will be removed in v5 of Transformers. Use "
+                "`repo_id` instead."
+            )
+            repo_id = model_card_kwargs.pop("repo_path_or_name")
+        # Deprecation warning will be sent after for repo_url and organization
+        repo_url = model_card_kwargs.pop("repo_url", None)
+        organization = model_card_kwargs.pop("organization", None)
+
+        if os.path.isdir(repo_id):
+            working_dir = repo_id
+            repo_id = repo_id.split(os.path.sep)[-1]
+        else:
+            working_dir = repo_id.split("/")[-1]
+
+        repo_id, token = self._create_repo(
+            repo_id, private=private, use_auth_token=use_auth_token, repo_url=repo_url, organization=organization
+        )
+
+        if use_temp_dir is None:
+            use_temp_dir = not os.path.isdir(working_dir)
+
+        with working_or_temp_dir(working_dir=working_dir, use_temp_dir=use_temp_dir) as work_dir:
+            files_timestamps = self._get_files_timestamps(work_dir)
+
+            # Save all files.
+            self.save_pretrained(work_dir, max_shard_size=max_shard_size)
+            if hasattr(self, "history") and hasattr(self, "create_model_card"):
+                # This is a Keras model and we might be able to fish out its History and make a model card out of it
+                base_model_card_args = {
+                    "output_dir": work_dir,
+                    "model_name": Path(repo_id).name,
+                }
+                base_model_card_args.update(model_card_kwargs)
+                self.create_model_card(**base_model_card_args)
+
+            self._upload_modified_files(
+                work_dir, repo_id, files_timestamps, commit_message=commit_message, token=token
+            )
 
 
 class TFConv1D(tf.keras.layers.Layer):
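The new TF `push_to_hub` leans on `working_or_temp_dir`, freshly imported from `utils`; its implementation is not part of this diff. A context manager with roughly these semantics (an assumption, not the library's actual code) could look like:

```python
import tempfile
from contextlib import contextmanager


@contextmanager
def working_or_temp_dir(working_dir, use_temp_dir=False):
    # Assumed behavior: yield a throwaway directory when use_temp_dir is
    # True, otherwise yield the given working_dir unchanged.
    if use_temp_dir:
        with tempfile.TemporaryDirectory() as tmp_dir:
            yield tmp_dir
    else:
        yield working_dir


# Usage mirroring push_to_hub: save into work_dir, then upload from it.
with working_or_temp_dir(working_dir="my-finetuned-bert", use_temp_dir=True) as work_dir:
    print(f"saving checkpoint files into {work_dir}")
```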
