Makefile (3 additions, 2 deletions)

@@ -20,8 +20,9 @@ checkstyle:
 # Command to run pytest for convergence tests
 # We have to explicitly set HF_DATASETS_OFFLINE=1, or `datasets` will silently try to send metrics and time out (80s): https://github.com/huggingface/datasets/blob/37a603679f451826cfafd8aae00738b01dcb9d58/src/datasets/load.py#L286
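A minimal shell sketch of the convergence-test setup described in that comment. Only the environment variable comes from the Makefile; the pytest path is illustrative:

```shell
# Force the `datasets` library offline so load_dataset() does not attempt
# network calls (metrics upload, hub lookups) and stall for its 80s timeout.
export HF_DATASETS_OFFLINE=1
echo "HF_DATASETS_OFFLINE=$HF_DATASETS_OFFLINE"

# Actual test run (commented out: requires the Liger-Kernel checkout):
# python -m pytest test/convergence
```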
README.md (14 additions, 80 deletions)
@@ -52,7 +52,8 @@
 <details>
 <summary>Latest News 🔥</summary>

+- [2024/11/6] We release [v0.4.0](https://github.com/linkedin/Liger-Kernel/releases/tag/v0.4.0): Full AMD support, Tech Report, Modal CI, Llama-3.2-Vision!
 - [2024/10/21] We have released the tech report of Liger Kernel on Arxiv: https://arxiv.org/pdf/2410.10989
 - [2024/9/6] We release v0.2.1 ([X post](https://x.com/liger_kernel/status/1832168197002510649)). 2500+ Stars, 10+ New Contributors, 50+ PRs, 50k Downloads in two weeks!
 - [2024/8/31] CUDA MODE talk, [Liger-Kernel: Real-world Triton kernel for LLM Training](https://youtu.be/gWble4FreV4?si=dxPeIchhkJ36Mbns), [Slides](https://github.com/cuda-mode/lectures?tab=readme-ov-file#lecture-28-liger-kernel)
@@ -80,18 +81,12 @@ With one line of code, Liger Kernel can increase throughput by more than 20% and
-| [**Hugging Face Trainer**](https://github.com/linkedin/Liger-Kernel/tree/main/examples/huggingface) | Train LLaMA 3-8B ~20% faster with over 40% memory reduction on Alpaca dataset using 4 A100s with FSDP | TBA |
-| [**Lightning Trainer**](https://github.com/linkedin/Liger-Kernel/tree/main/examples/lightning) | Increase 15% throughput and reduce memory usage by 40% with LLaMA3-8B on MMLU dataset using 8 A100s with DeepSpeed ZeRO3 | TBA |
-| [**Medusa Multi-head LLM (Retraining Phase)**](https://github.com/linkedin/Liger-Kernel/tree/main/examples/medusa) | Reduce memory usage by 80% with 5 LM heads and improve throughput by 40% using 8 A100s with FSDP | TBA |
+| [**Hugging Face Trainer**](https://github.com/linkedin/Liger-Kernel/tree/main/examples/huggingface) | Train LLaMA 3-8B ~20% faster with over 40% memory reduction on Alpaca dataset using 4 A100s with FSDP |
+| [**Lightning Trainer**](https://github.com/linkedin/Liger-Kernel/tree/main/examples/lightning) | Increase 15% throughput and reduce memory usage by 40% with LLaMA3-8B on MMLU dataset using 8 A100s with DeepSpeed ZeRO3 |
+| [**Medusa Multi-head LLM (Retraining Phase)**](https://github.com/linkedin/Liger-Kernel/tree/main/examples/medusa) | Reduce memory usage by 80% with 5 LM heads and improve throughput by 40% using 8 A100s with FSDP |

 ## Key Features
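The "one line of code" integration works by patching a model's layer implementations in place. A toy sketch of that monkey-patching pattern; the module and function names below are invented for illustration and are not Liger-Kernel's actual API:

```python
import types

# Stand-in for a modeling module whose ops we want to swap (hypothetical).
modeling = types.SimpleNamespace()
modeling.rms_norm = lambda xs, eps=1e-6: [
    x / (sum(v * v for v in xs) / len(xs) + eps) ** 0.5 for x in xs
]

def fused_rms_norm(xs, eps=1e-6):
    # Pretend "fused" replacement: identical math, computed in a single pass.
    inv = (sum(v * v for v in xs) / len(xs) + eps) ** -0.5
    return [x * inv for x in xs]

def apply_fused_kernels(module):
    """The one-line entry point: rebind the module's op to the fused version."""
    module.rms_norm = fused_rms_norm

apply_fused_kernels(modeling)      # the "one line" in user code
out = modeling.rms_norm([3.0, 4.0])
```

Because the patch rebinds the attribute on the module object, every later caller that looks the function up through the module picks up the fused version with no other code changes.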
@@ -102,13 +97,6 @@ With one line of code, Liger Kernel can increase throughput by more than 20% and
 - **Embedding**: [Embedding](https://pytorch.org/docs/stable/generated/torch.nn.Embedding.html) is implemented by fusing embedding lookup and output operations. It achieves a peak speedup of ~1.5x in the forward pass and an overall speedup of ~1.1x.
 - **Matmul int2xint8**: implemented with cache-tiled matrix multiplication, fusing the matmul with the unpacking step; this achieves a considerable speedup and performs on par with `torch.compile`.
-<!-- TODO: be more specific about batch size -->
-> **Note:**
-> Reported speedups and memory reductions are with respect to the LLaMA 3-8B Hugging Face layer implementations. All models use 4K hidden size and 4K sequence length and are evaluated based on memory usage and wall time for the forward+backward pass on a single NVIDIA A100 80G GPU using small batch sizes. Liger kernels exhibit more efficient scaling to larger batch sizes, detailed further in the [Benchmark](./benchmark) folder.
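The int2 × int8 matmul above hinges on unpacking 2-bit weights stored four to a byte. A minimal pure-Python sketch of that packing scheme (the bit layout here is an assumption for illustration; the actual kernel fuses this unpacking into the tiled Triton matmul):

```python
def pack_int2(vals):
    """Pack 2-bit values (0..3) four-per-byte, lowest bits first."""
    assert len(vals) % 4 == 0
    packed = bytearray()
    for i in range(0, len(vals), 4):
        byte = 0
        for j, v in enumerate(vals[i:i + 4]):
            assert 0 <= v <= 3
            byte |= v << (2 * j)
        packed.append(byte)
    return bytes(packed)

def unpack_int2(packed):
    """Inverse of pack_int2: recover four 2-bit values from each byte."""
    return [(b >> (2 * j)) & 0b11 for b in packed for j in range(4)]

weights = [3, 0, 1, 2, 2, 2, 0, 1]
assert unpack_int2(pack_int2(weights)) == weights  # round-trip; 4x smaller storage
```

Fusing the unpack into the matmul means the unpacked int8 weight tile never has to be written back to memory, which is where the speedup comes from.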
-| 1 | [Unsloth](https://github.com/unslothai/unsloth/blob/fd753fed99ed5f10ef8a9b7139588d9de9ddecfb/unsloth/kernels/utils.py#L43) | `calculate_settings` to determine block size and warps; we reuse it for Norm and MLP | [Liger Kernel Utils](https://github.com/linkedin/Liger-Kernel/blob/e249eee723978bf8610ff1ea2297d048a2417e20/src/liger_kernel/ops/utils.py#L23) | [Apache](https://github.com/unslothai/unsloth/blob/fd753fed99ed5f10ef8a9b7139588d9de9ddecfb/LICENSE) |
-| 2 | [Unsloth](https://github.com/unslothai/unsloth/blob/976d11a10d54383aeb7a692c69e01151a20bfd72/unsloth/kernels/rms_layernorm.py#L48) | We modified and added dW calculation on top of the Unsloth implementation | [Liger Kernel RMS Norm](https://github.com/linkedin/Liger-Kernel/blob/e249eee723978bf8610ff1ea2297d048a2417e20/src/liger_kernel/ops/rms_norm.py#L50) | [Apache](https://github.com/unslothai/unsloth/blob/fd753fed99ed5f10ef8a9b7139588d9de9ddecfb/LICENSE) |
-| 3 | [Triton tutorial](https://triton-lang.org/main/index.html) | We built on top of the Triton tutorials | [Liger Kernel RMS Norm](https://github.com/linkedin/Liger-Kernel/blob/e249eee723978bf8610ff1ea2297d048a2417e20/src/liger_kernel/ops/rms_norm.py#L50) | [MIT](https://github.com/triton-lang/triton/blob/main/LICENSE) |
-| 4 | [tiny shakespeare dataset](https://huggingface.co/datasets/karpathy/tiny_shakespeare) | We use the tiny shakespeare dataset for convergence tests on mini models | [Liger Kernel Convergence](https://github.com/linkedin/Liger-Kernel/tree/main/test/convergence) | N/A |
-| 5 | [Efficient Cross Entropy](https://github.com/mgmalek/efficient_cross_entropy) | We use the idea of gradient-in-forward and chunking | [Liger Kernel Linear Cross Entropy](https://github.com/linkedin/Liger-Kernel/blob/main/src/liger_kernel/ops/fused_linear_cross_entropy.py) | [MIT](https://github.com/mgmalek/efficient_cross_entropy/blob/main/LICENSE) |
-| 6 | [Flash attn](https://github.com/Dao-AILab/flash-attention) | We take many optimization ideas from the work, such as tiling and recomputation | | [BSD](https://github.com/Dao-AILab/flash-attention/blob/main/LICENSE) |
-| 7 | [AutoAWQ](https://github.com/casper-hansen/AutoAWQ) | We reference the design of automodel | [Liger Kernel Auto Model](https://github.com/linkedin/Liger-Kernel/blob/main/src/liger_kernel/transformers/auto_model.py) | [MIT](https://github.com/casper-hansen/AutoAWQ/blob/main/LICENSE) |
-| 8 | [llm.c](https://github.com/karpathy/llm.c) | We reference the design of end-to-end testing | [Liger Kernel Convergence Tests](https://github.com/linkedin/Liger-Kernel/tree/main/test/convergence) | [MIT](https://github.com/karpathy/llm.c/blob/master/LICENSE) |
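Row 5's "gradient-in-forward and chunking": process the logits in chunks and emit each chunk's gradient during the forward pass, so the full probability matrix is never materialized or saved for backward. A toy pure-Python sketch of the idea, not the Triton implementation:

```python
import math

def chunked_xent_with_grad(logits, targets, chunk_size=2):
    """Mean cross-entropy over rows, plus d(loss)/d(logits), chunk by chunk.
    Each row's gradient, (softmax - one_hot) / n, is produced in the forward
    pass, so only one chunk of probabilities is live at a time."""
    n = len(logits)
    loss, grads = 0.0, []
    for start in range(0, n, chunk_size):
        for row, tgt in zip(logits[start:start + chunk_size],
                            targets[start:start + chunk_size]):
            m = max(row)                        # stabilize the softmax
            exps = [math.exp(x - m) for x in row]
            z = sum(exps)
            probs = [e / z for e in exps]
            loss += -math.log(probs[tgt])
            grads.append([(p - (j == tgt)) / n for j, p in enumerate(probs)])
    return loss / n, grads

loss, grads = chunked_xent_with_grad([[0.0, 0.0], [2.0, 0.0]], [0, 1])
```

In the fused-linear variant this pays off doubly: the per-chunk logits themselves are produced from the hidden states on the fly, so the `[batch, vocab]` logit tensor never exists either.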
-Many thanks to the contributors to these projects for their invaluable work that helped make Liger possible.
-
-## License
-
-This project is licensed under the [BSD 2-CLAUSE](https://github.com/linkedin/Liger-Kernel/blob/main/LICENSE) License (see `LICENSE` for details).
-It also includes components from projects licensed under:
-
-- Apache License 2.0 (see `LICENSE-APACHE-2.0` for details).
-- MIT License (see `LICENSE-MIT-AutoAWQ` for details).
-- MIT License (see `LICENSE-MIT-Efficient Cross Entropy` for details).
-- MIT License (see `LICENSE-MIT-llmc` for details).
-- MIT License (see `LICENSE-MIT-triton` for details).
+
+## Contributing, Acknowledgements, and License