34 changes: 30 additions & 4 deletions .github/workflows/self-push.yml
@@ -111,9 +111,24 @@ jobs:
          echo "::set-output name=matrix::$keys"
          echo "::set-output name=test_map::$test_map"

  run_check_runners:
    name: Check Runners
    needs: setup
    strategy:
      matrix:
        machine_type: [single-gpu, multi-gpu]
    runs-on: [self-hosted, docker-gpu, '${{ matrix.machine_type }}']
    container:
      image: huggingface/transformers-all-latest-gpu
      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
    steps:
      - name: NVIDIA-SMI
        run: |
          nvidia-smi

  run_tests_single_gpu:
    name: Model tests
    needs: setup
    needs: [setup, run_check_runners]
    # `dummy` means there is no test to run
    if: contains(fromJson(needs.setup.outputs.matrix), 'dummy') != true
    strategy:
@@ -198,7 +213,7 @@ jobs:

  run_tests_multi_gpu:
    name: Model tests
    needs: setup
    needs: [setup, run_check_runners]
    # `dummy` means there is no test to run
    if: contains(fromJson(needs.setup.outputs.matrix), 'dummy') != true
    strategy:
@@ -285,7 +300,7 @@ jobs:

  run_tests_torch_cuda_extensions_single_gpu:
    name: Torch CUDA extension tests
    needs: setup
    needs: [setup, run_check_runners]
    if: contains(fromJson(needs.setup.outputs.matrix), 'deepspeed') || contains(fromJson(needs.setup.outputs.matrix), 'extended')
    strategy:
      fail-fast: false
@@ -364,7 +379,7 @@ jobs:

  run_tests_torch_cuda_extensions_multi_gpu:
    name: Torch CUDA extension tests
    needs: setup
    needs: [setup, run_check_runners]
    if: contains(fromJson(needs.setup.outputs.matrix), 'deepspeed') || contains(fromJson(needs.setup.outputs.matrix), 'extended')
    strategy:
      fail-fast: false
@@ -447,12 +462,20 @@ jobs:
    if: always()
    needs: [
      setup,
      run_check_runners,
      run_tests_single_gpu,
      run_tests_multi_gpu,
      run_tests_torch_cuda_extensions_single_gpu,
      run_tests_torch_cuda_extensions_multi_gpu
    ]
    steps:
      - name: Preliminary job status
        shell: bash
        # For the meaning of these environment variables, see the job `Setup`
        run: |
          echo "Setup status: ${{ needs.setup.result }}"
          echo "Runner status: ${{ needs.run_check_runners.result }}"

      # Necessary to get the correct branch name and commit SHA for `workflow_run` event
      # We also take into account the `push` event (we might want to test some changes in a branch)
      - name: Prepare custom environment variables
@@ -498,6 +521,9 @@ jobs:
          CI_TITLE_PUSH: ${{ github.event.head_commit.message }}
          CI_TITLE_WORKFLOW_RUN: ${{ github.event.workflow_run.head_commit.message }}
          CI_SHA: ${{ env.CI_SHA }}
          SETUP_STATUS: ${{ needs.setup.result }}
          RUNNER_STATUS: ${{ needs.run_check_runners.result }}

        # We pass `needs.setup.outputs.matrix` as the argument. A processing step in `notification_service.py` is
        # required to change `models/bert` to `models_bert`, as the artifact names use `_` instead of `/`.
        run: |
4 changes: 2 additions & 2 deletions docker/transformers-all-latest-gpu/Dockerfile
@@ -9,7 +9,7 @@ SHELL ["sh", "-lc"]
# The following `ARG` are mainly used to specify the versions explicitly & directly in this docker file, and not meant
# to be used as arguments for docker build (so far).

ARG PYTORCH='1.12.0'
ARG PYTORCH='1.12.1'
# (not always a valid torch version)
ARG INTEL_TORCH_EXT='1.11.0'
# Example: `cu102`, `cu113`, etc.
@@ -46,7 +46,7 @@ RUN python3 -m pip install -U "itsdangerous<2.1.0"
RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/accelerate@main#egg=accelerate

# Add bitsandbytes for mixed int8 testing
RUN python3 -m pip install -i https://test.pypi.org/simple/ bitsandbytes==0.31.5
RUN python3 -m pip install --no-cache-dir bitsandbytes

RUN python3 -m pip install --no-cache-dir decord

@@ -3,7 +3,7 @@ LABEL maintainer="Hugging Face"

ARG DEBIAN_FRONTEND=noninteractive

ARG PYTORCH='1.12.0'
ARG PYTORCH='1.12.1'
# Example: `cu102`, `cu113`, etc.
ARG CUDA='cu113'

2 changes: 1 addition & 1 deletion docker/transformers-pytorch-gpu/Dockerfile
@@ -12,7 +12,7 @@ RUN git clone https://github.com/huggingface/transformers && cd transformers &&
RUN python3 -m pip install --no-cache-dir -e ./transformers[dev-torch,testing]

# If set to nothing, will install the latest version
ARG PYTORCH='1.12.0'
ARG PYTORCH='1.12.1'
ARG TORCH_VISION=''
ARG TORCH_AUDIO=''

40 changes: 0 additions & 40 deletions docs/source/en/main_classes/model.mdx
@@ -133,46 +133,6 @@ model = AutoModel.from_config(config)

Due to PyTorch design, this functionality is only available for floating dtypes.

### `bitsandbytes` integration for Int8 mixed-precision matrix decomposition

From the paper `GPT3.int8() : 8-bit Matrix Multiplication for Transformers at Scale`, we support Hugging Face 🤗 integration for all models in the Hub with a few lines of code,
for models trained in half precision (`float16` or `bfloat16`) or in full precision. This method aims to reduce the `nn.Linear` size by 2 (if trained in half precision) or by 4 (if trained in full precision), without affecting quality too much, by operating on the outliers in half precision.
This technique is useful and works well for billion-scale models (>1B parameters), therefore we advise you to use it only for models of that scale. It has been tested on models from 2 billion to 176 billion parameters and supports only PyTorch models.

![HFxbitsandbytes.png](https://s3.amazonaws.com/moonup/production/uploads/1659861207959-62441d1d9fdefb55a0b7d12c.png)

Int8 mixed-precision matrix decomposition works by separating a matrix multiplication into two streams: (1) a systematic feature outlier stream matrix multiplied in fp16 (0.01%), (2) a regular stream of int8 matrix multiplication (99.9%). With this method, int8 inference with no predictive degradation is possible for very large models (>=176B parameters).
Values are usually normally distributed, that is, most values are in the range [-3.5, 3.5], but there are some exceptional systematic outliers that are distributed very differently for large models. These outliers are often in the interval [-60, -6] or [6, 60]. Int8 quantization works well for values of magnitude ~5, but beyond that there is a significant performance penalty. A good default threshold is 6, but a lower threshold might be needed for less stable models (small models, fine-tuning).

Note also that you would require a GPU to run mixed-8bit models, as the kernels have been compiled for GPUs only. Make sure that you have enough GPU RAM to store a quarter (or half, if your model is natively in half precision) of the model before using this feature.

Below are some notes to help you use this module, or follow this demo on Google colab: [![Open In Google Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1qOjXfQIAULfKvZqwCen8-MoWKGdSatZ4?usp=sharing)

#### Requirements

- Make sure you run this on an NVIDIA GPU that supports 8-bit tensor cores (Turing or Ampere GPUs, e.g. T4, RTX20s, RTX30s, A40-A100). Note that previous generations of NVIDIA GPUs do not support 8-bit tensor cores.
- Install the correct version of `bitsandbytes` by running:
`pip install -i https://test.pypi.org/simple/ bitsandbytes`
- Install `accelerate`:
`pip install accelerate`

#### Running mixed-int8 models

After carefully installing the required libraries, the way to load your mixed 8-bit model is as follows:
```py
model_name = "bigscience/bloom-2b5"
model_8bit = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", load_in_8bit=True)
```
The implementation supports a multi-GPU setup thanks to `accelerate` as the backend. If you want to control the GPU memory allocated to each GPU, you can use the `max_memory` argument as follows
(to allocate `1GB` to GPU-0 and `2GB` to GPU-1, use `max_memory={0:"1GB", 1:"2GB"}`):
```py
max_memory_mapping = {0: "1GB", 1: "2GB"}
model_name = "bigscience/bloom-3b"
model_8bit = AutoModelForCausalLM.from_pretrained(
    model_name, device_map="auto", load_in_8bit=True, max_memory=max_memory_mapping
)
```


## ModuleUtilsMixin

60 changes: 60 additions & 0 deletions docs/source/en/main_classes/trainer.mdx
@@ -591,6 +591,66 @@ More details in this [issues](https://github.com/pytorch/pytorch/issues/75676).
More details are mentioned in this [issue](https://github.com/pytorch/pytorch/issues/76501)
(`The original model parameters' .grads are not set, meaning that they cannot be optimized separately (which is why we cannot support multiple parameter groups)`).

### Using Trainer for accelerated PyTorch Training on Mac

With the PyTorch v1.12 release, developers and researchers can take advantage of Apple silicon GPUs for significantly faster model training.
This unlocks the ability to perform machine learning workflows like prototyping and fine-tuning locally, right on Mac.
Apple's Metal Performance Shaders (MPS) backend for PyTorch enables this and can be used via the new `"mps"` device,
which maps computational graphs and primitives onto the MPS Graph framework and the tuned kernels provided by MPS.
For more information, please refer to the official documentation: [Introducing Accelerated PyTorch Training on Mac](https://pytorch.org/blog/introducing-accelerated-pytorch-training-on-mac/)
and the [MPS backend](https://pytorch.org/docs/stable/notes/mps.html) notes.
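
Before launching a full training run, you can sanity-check that the MPS device is usable on your machine. Below is a minimal sketch using the `torch.backends.mps` availability checks that ship with PyTorch >= 1.12 (the printed messages are illustrative):

```py
import torch

if torch.backends.mps.is_available():
    device = torch.device("mps")
    # Tiny smoke test: this tensor lives on the Apple silicon GPU.
    print(torch.ones(2, 2, device=device))
elif not torch.backends.mps.is_built():
    print("This PyTorch build was not compiled with MPS support.")
else:
    print("MPS requires macOS 12.3+ on an Apple silicon machine.")
```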

<Tip warning={false}>

We strongly recommend installing PyTorch >= 1.13 (a nightly version at the time of writing) on your macOS machine.
It has major fixes related to model correctness and performance improvements for transformer-based models.
Please refer to https://github.com/pytorch/pytorch/issues/82707 for more details.

</Tip>

**Benefits of Training and Inference using Apple Silicon Chips**

1. Enables users to train larger networks or batch sizes locally
2. Reduces data retrieval latency and provides the GPU with direct access to the full memory store thanks to the unified memory architecture,
thereby improving end-to-end performance.
3. Reduces costs associated with cloud-based development or the need for additional local GPUs.

**Pre-requisites**: To install torch with MPS support,
please follow this nice Medium article: [GPU-Acceleration Comes to PyTorch on M1 Macs](https://medium.com/towards-data-science/gpu-acceleration-comes-to-pytorch-on-m1-macs-195c399efcc1).

**Usage**:
Simply pass the `--use_mps_device` argument.
For example, you can run the official GLUE text classification task (from the root folder) on an Apple silicon GPU with the command below:

```bash
export TASK_NAME=mrpc

python examples/pytorch/text-classification/run_glue.py \
--model_name_or_path bert-base-cased \
--task_name $TASK_NAME \
--do_train \
--do_eval \
--max_seq_length 128 \
--per_device_train_batch_size 32 \
--learning_rate 2e-5 \
--num_train_epochs 3 \
--output_dir /tmp/$TASK_NAME/ \
--use_mps_device \
--overwrite_output_dir
```

**A few caveats to be aware of**

1. Some PyTorch operations have not been implemented in MPS and will throw an error.
One way to get around that is to set the environment variable `PYTORCH_ENABLE_MPS_FALLBACK=1`,
which will fall back to CPU for these operations (see the sketch after this list); it still throws a `UserWarning`, however.
2. The distributed backends `gloo` and `nccl` do not work with the `mps` device,
which means that currently only a single GPU of the `mps` device type can be used.
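
As an illustration of the first caveat, here is a minimal sketch of enabling the CPU fallback from Python; the environment variable is the documented switch, while the surrounding code is illustrative and must run before PyTorch is initialized:

```py
import os

# Must be set before the first `import torch` in the process.
os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"

import torch

x = torch.randn(4, device="mps")
# Ops without an MPS kernel now fall back to CPU with a UserWarning
# instead of raising an error.
```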

Finally, remember that 🤗 `Trainer` only integrates the MPS backend, so if you
have any problems or questions about MPS backend usage, please
file an issue on [PyTorch GitHub](https://github.com/pytorch/pytorch/issues).

Sections that were moved:

[ <a href="./deepspeed#deepspeed-trainer-integration">DeepSpeed</a><a id="deepspeed"></a>
2 changes: 1 addition & 1 deletion docs/source/en/model_doc/longt5.mdx
@@ -37,7 +37,7 @@ Tips:
- [`LongT5ForConditionalGeneration`] is an extension of [`T5ForConditionalGeneration`] exchanging the traditional
encoder *self-attention* layer with either efficient *local* attention or *transient-global* (*tglobal*) attention.
- Unlike the T5 model, LongT5 does not use a task prefix. Furthermore, it uses a different pre-training objective
inspired by the pre-training of `[PegasusForConditionalGeneration]`.
inspired by the pre-training of [`PegasusForConditionalGeneration`].
- The LongT5 model is designed to work efficiently and very well on long-range *sequence-to-sequence* tasks where the
input sequence exceeds the commonly used 512 tokens. It is capable of handling input sequences up to 16,384 tokens long.
- For *Local Attention*, the sparse sliding-window local attention operation allows a given token to attend only `r`
53 changes: 53 additions & 0 deletions docs/source/en/perf_train_gpu_one.mdx
@@ -733,3 +733,56 @@ This feature involves 3 different libraries. To install them, please follow the
- [Torchdynamo installation](https://github.com/pytorch/torchdynamo#requirements-and-setup)
- [Functorch installation](https://github.com/pytorch/functorch#install)
- [Torch-TensorRT(FX) installation](https://github.com/pytorch/TensorRT/blob/master/docsrc/tutorials/getting_started_with_fx_path.rst#installation)

## `bitsandbytes` integration for Int8 mixed-precision matrix decomposition

From the paper [`LLM.int8() : 8-bit Matrix Multiplication for Transformers at Scale`](https://arxiv.org/abs/2208.07339), we support Hugging Face integration for all models in the Hub with a few lines of code.
The method reduces the `nn.Linear` size by 2 for `float16` and `bfloat16` weights and by 4 for `float32` weights, with close to no impact on quality, by operating on the outliers in half precision.

![HFxbitsandbytes.png](https://s3.amazonaws.com/moonup/production/uploads/1659861207959-62441d1d9fdefb55a0b7d12c.png)

Int8 mixed-precision matrix decomposition works by separating a matrix multiplication into two streams: (1) a systematic feature outlier stream matrix multiplied in fp16 (0.01%), (2) a regular stream of int8 matrix multiplication (99.9%). With this method, int8 inference with no predictive degradation is possible for very large models.
For more details regarding the method, check out the [paper](https://arxiv.org/abs/2208.07339) or our [blogpost about the integration](https://huggingface.co/blog/hf-bitsandbytes-integration).

![MixedInt8.gif](https://s3.amazonaws.com/moonup/production/uploads/1660567469965-62441d1d9fdefb55a0b7d12c.gif)
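
To make the two streams concrete, here is a toy sketch of the decomposition in plain PyTorch. This is an illustration only: the real `bitsandbytes` kernels are fused and run on GPU, and the function name, threshold, and per-row/per-column absmax scaling below are simplified assumptions rather than the library's API:

```py
import torch

def mixed_int8_matmul_sketch(x, w, threshold=6.0):
    # Toy LLM.int8()-style decomposition of x @ w.
    # 1. Find outlier feature dimensions: columns of `x` with large magnitude.
    outliers = x.abs().max(dim=0).values > threshold

    # 2. Outlier stream: a tiny fraction of columns, kept in higher precision
    #    (fp16 in the real kernels; plain float32 here so the sketch runs on CPU).
    out_hi = x[:, outliers] @ w[outliers, :]

    # 3. Regular stream: absmax-quantize the rest to int8, matmul in integers.
    x_r, w_r = x[:, ~outliers], w[~outliers, :]
    sx = (x_r.abs().amax(dim=1, keepdim=True) / 127).clamp(min=1e-8)  # per-row scales
    sw = (w_r.abs().amax(dim=0, keepdim=True) / 127).clamp(min=1e-8)  # per-column scales
    x_q = (x_r / sx).round().to(torch.int8)
    w_q = (w_r / sw).round().to(torch.int8)
    out_int8 = (x_q.int() @ w_q.int()).float() * (sx * sw)  # dequantize

    return out_hi + out_int8

x = torch.randn(8, 512)
x[:, 0] *= 20  # inject one systematic outlier feature
w = torch.randn(512, 256)
err = (mixed_int8_matmul_sketch(x, w) - x @ w).abs().mean()
print(f"mean absolute error: {err:.4f}")
```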

Note that you need a GPU to run mixed-8bit models, as the kernels have been compiled for GPUs only. Make sure that you have enough GPU memory to store a quarter (or half, if your model weights are natively in half precision) of the model before using this feature.
Below are some notes to help you use this module, or follow the demos on [Google Colab](#colab-demos).

### Requirements

- Make sure you run this on NVIDIA GPUs that support 8-bit tensor cores (Turing, Ampere or newer architectures, e.g. T4, RTX20s, RTX30s, A40-A100).
- Install the correct version of `bitsandbytes` by running:
`pip install bitsandbytes>=0.31.5`
- Install `accelerate` by running:
`pip install accelerate>=0.12.0`

### Running mixed-int8 models

After installing the required libraries, the way to load your mixed 8-bit model is as follows:
```py
from transformers import AutoModelForCausalLM

model_name = "bigscience/bloom-2b5"
model_8bit = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", load_in_8bit=True)
```
The current implementation supports a multi-GPU setup when using `accelerate`. If you want to control how much GPU memory to allocate to each GPU, use the `max_memory` argument as follows:

```py
max_memory_mapping = {0: "1GB", 1: "2GB"}
model_name = "bigscience/bloom-3b"
model_8bit = AutoModelForCausalLM.from_pretrained(
    model_name, device_map="auto", load_in_8bit=True, max_memory=max_memory_mapping
)
```

In this example, the first GPU will use 1GB of memory and the second 2GB.

### Colab demos

With this method you can run inference on models that previously could not fit on a Google Colab instance.
For example, check out the demo for running T5-11b (42GB in fp32) with 8-bit quantization on Google Colab:

[![Open In Colab: T5-11b demo](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1YORPWx4okIHXnjW7MSAidXN29mPVNT7F?usp=sharing)

Or this demo for BLOOM-3B:

[![Open In Colab: BLOOM-3b demo](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1qOjXfQIAULfKvZqwCen8-MoWKGdSatZ4?usp=sharing)
9 changes: 9 additions & 0 deletions examples/flax/language-modeling/run_mlm_flax.py
@@ -107,6 +107,12 @@ class TrainingArguments:
        default=None, metadata={"help": "The name of the repository to keep in sync with the local `output_dir`."}
    )
    hub_token: str = field(default=None, metadata={"help": "The token to use to push to the Model Hub."})
    gradient_checkpointing: bool = field(
        default=False,
        metadata={
            "help": "If True, use gradient checkpointing to save memory at the expense of slower backward pass."
        },
    )

    def __post_init__(self):
        if self.output_dir is not None:
@@ -640,6 +646,9 @@ def group_texts(examples):
        dtype=getattr(jnp, model_args.dtype),
    )

    if training_args.gradient_checkpointing:
        model.enable_gradient_checkpointing()

    # Store some constant
    num_epochs = int(training_args.num_train_epochs)
    train_batch_size = int(training_args.per_device_train_batch_size) * jax.device_count()
9 changes: 9 additions & 0 deletions examples/flax/summarization/run_summarization_flax.py
@@ -121,6 +121,12 @@ class TrainingArguments:
        default=None, metadata={"help": "The name of the repository to keep in sync with the local `output_dir`."}
    )
    hub_token: str = field(default=None, metadata={"help": "The token to use to push to the Model Hub."})
    gradient_checkpointing: bool = field(
        default=False,
        metadata={
            "help": "If True, use gradient checkpointing to save memory at the expense of slower backward pass."
        },
    )

    def __post_init__(self):
        if self.output_dir is not None:
@@ -535,6 +541,9 @@ def main():
        dtype=getattr(jnp, model_args.dtype),
    )

    if training_args.gradient_checkpointing:
        model.enable_gradient_checkpointing()

    if model.config.decoder_start_token_id is None:
        raise ValueError("Make sure that `config.decoder_start_token_id` is correctly defined")

13 changes: 10 additions & 3 deletions examples/pytorch/language-modeling/run_clm_no_trainer.py
@@ -464,7 +464,7 @@ def group_texts(examples):

# Optimizer
# Split weights in two groups, one with weight decay and the other not.
no_decay = ["bias", "LayerNorm.weight"]
no_decay = ["bias", "layer_norm.weight"]
optimizer_grouped_parameters = [
{
"params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
@@ -558,10 +558,15 @@ def group_texts(examples):
        starting_epoch = int(training_difference.replace("epoch_", "")) + 1
        resume_step = None
    else:
        resume_step = int(training_difference.replace("step_", ""))
        # need to multiply `gradient_accumulation_steps` to reflect real steps
        resume_step = int(training_difference.replace("step_", "")) * args.gradient_accumulation_steps
        starting_epoch = resume_step // len(train_dataloader)
        resume_step -= starting_epoch * len(train_dataloader)

    # update the progress_bar if load from checkpoint
    progress_bar.update(starting_epoch * num_update_steps_per_epoch)
    completed_steps = starting_epoch * num_update_steps_per_epoch

    for epoch in range(starting_epoch, args.num_train_epochs):
        model.train()
        if args.with_tracking:
@@ -570,7 +575,9 @@ def group_texts(examples):
            # We need to skip steps until we reach the resumed step
            if args.resume_from_checkpoint and epoch == starting_epoch:
                if resume_step is not None and step < resume_step:
                    completed_steps += 1
                    if step % args.gradient_accumulation_steps == 0:
                        progress_bar.update(1)
                        completed_steps += 1
                    continue

            with accelerator.accumulate(model):