Merged

106 commits
12f97c8
use exact model name
tajimagrp Nov 26, 2024
c4cb50b
Update save.py
danielhanchen Dec 27, 2024
75e4756
Update _utils.py
danielhanchen Dec 27, 2024
e86b18f
Update _utils.py
danielhanchen Dec 27, 2024
f565ccf
Update _utils.py
danielhanchen Dec 27, 2024
c5d0aa9
Update _utils.py
danielhanchen Dec 27, 2024
af7d6cc
print
danielhanchen Dec 27, 2024
281cb73
Update _utils.py
danielhanchen Dec 27, 2024
b60acda
Update _utils.py
danielhanchen Dec 27, 2024
855d0f8
Update llama.py
danielhanchen Dec 27, 2024
fe4e9b8
Update _utils.py
danielhanchen Dec 27, 2024
48161a2
Update vision.py
danielhanchen Dec 27, 2024
52b2451
Update _utils.py
danielhanchen Dec 27, 2024
8d39e73
Update _utils.py
danielhanchen Dec 27, 2024
a7e5803
Update _utils.py
danielhanchen Dec 27, 2024
5038ba7
Update _utils.py
danielhanchen Dec 27, 2024
0882287
Update _utils.py
danielhanchen Dec 27, 2024
ab71dce
Update _utils.py
danielhanchen Dec 27, 2024
dd054c3
Update _utils.py
danielhanchen Dec 27, 2024
6c80d0f
Update _utils.py
danielhanchen Dec 27, 2024
ea8e8a2
Update loader.py
danielhanchen Dec 27, 2024
33ed089
accurate_accumulation
danielhanchen Dec 28, 2024
c3b41b8
Update loader.py
danielhanchen Dec 28, 2024
142f026
Update loader.py
danielhanchen Dec 28, 2024
eecab40
Update _utils.py
danielhanchen Dec 28, 2024
8cec2fa
Update loader.py
danielhanchen Dec 28, 2024
c68007c
Update loader.py
danielhanchen Dec 29, 2024
5495311
Update loader.py
danielhanchen Dec 29, 2024
ea2c647
Update loader.py
danielhanchen Dec 29, 2024
f1da2a6
Update pyproject.toml
danielhanchen Dec 29, 2024
dc9efb4
Merge branch 'main' into nightly
danielhanchen Dec 29, 2024
3e1dbab
Update __init__.py
danielhanchen Dec 30, 2024
a0d39ff
Update pyproject.toml
danielhanchen Dec 30, 2024
c3d4e18
Update __init__.py
danielhanchen Dec 30, 2024
7d7a1b0
Update __init__.py
danielhanchen Dec 30, 2024
bfce3d4
Fix Triton heuristics
danielhanchen Dec 30, 2024
743106e
Update __init__.py
danielhanchen Dec 30, 2024
4e0986f
Update __init__.py
danielhanchen Dec 31, 2024
abebd11
Update __init__.py
danielhanchen Dec 31, 2024
f021609
Update __init__.py
danielhanchen Dec 31, 2024
512773e
Xformers
danielhanchen Jan 1, 2025
b4549cd
Update loader.py
danielhanchen Jan 1, 2025
6760499
Update loader.py
danielhanchen Jan 2, 2025
c25f20c
Rewind
danielhanchen Jan 2, 2025
c90b3bf
Update _utils.py
danielhanchen Jan 2, 2025
9379522
Update _utils.py
danielhanchen Jan 2, 2025
9a66c6f
requires grad
danielhanchen Jan 3, 2025
bb9ab04
Update loader.py
danielhanchen Jan 3, 2025
3e096ac
Update _utils.py
danielhanchen Jan 4, 2025
99898da
Update loader.py
danielhanchen Jan 5, 2025
3f251f5
Merge branch 'pr/1339' into nightly
danielhanchen Jan 5, 2025
86ab9f1
changing model to base_model if peft model is already used
mosama1994 Jan 6, 2025
7134fee
Merge branch 'main' into nightly
danielhanchen Jan 7, 2025
039a507
Improve debugging experience (#1512)
Erland366 Jan 7, 2025
f40558f
Update loader.py
danielhanchen Jan 7, 2025
a229db5
Update llama.py
danielhanchen Jan 7, 2025
b7ddf96
Update llama.py
danielhanchen Jan 7, 2025
2b5d470
Revert "Update llama.py"
danielhanchen Jan 7, 2025
52d2895
Update llama.py
danielhanchen Jan 7, 2025
777f209
Merge branch 'pr/1509' into nightly
danielhanchen Jan 7, 2025
1e8cf02
Update llama.py
danielhanchen Jan 7, 2025
cef7e58
Update llama.py
danielhanchen Jan 7, 2025
ca8e92c
Update llama.py
danielhanchen Jan 7, 2025
dbef42d
Update llama.py
danielhanchen Jan 7, 2025
0dd136d
Update llama.py
danielhanchen Jan 7, 2025
3369f00
Update llama.py
danielhanchen Jan 7, 2025
61ecb22
Update llama.py
danielhanchen Jan 7, 2025
ec03332
Update llama.py
danielhanchen Jan 7, 2025
fa02ce1
Update llama.py
danielhanchen Jan 7, 2025
06d4057
Update llama.py
danielhanchen Jan 7, 2025
5004796
Update llama.py
danielhanchen Jan 7, 2025
2608fe4
Update llama.py
danielhanchen Jan 7, 2025
2b3391f
Auto change is_bfloat16_supported
danielhanchen Jan 7, 2025
a1b897e
Update llama.py
danielhanchen Jan 7, 2025
ce84095
Force data-type
danielhanchen Jan 7, 2025
ad31cb6
Update llama.py
danielhanchen Jan 7, 2025
d7a2057
All attention refactor fix (#1491)
KareemMusleh Jan 7, 2025
0cb9c5f
Update llama.py
danielhanchen Jan 7, 2025
e3a92e0
Update llama.py
danielhanchen Jan 7, 2025
422c033
Update granite to work with latest post_patch methods (#1502)
Datta0 Jan 7, 2025
83b48a8
Minor fixes for granite models (#1503)
CoffeeVampir3 Jan 7, 2025
e0ccfaf
support modelscope models and datasets (#1481)
tastelikefeet Jan 7, 2025
63ad366
Merge branch 'main' into nightly
danielhanchen Jan 8, 2025
a7d7838
Phi 4
danielhanchen Jan 8, 2025
62f074c
Merge branch 'main' into nightly
danielhanchen Jan 8, 2025
8d28389
Merge branch 'main' into nightly
danielhanchen Jan 15, 2025
2ced650
Update llama.py
danielhanchen Jan 15, 2025
a76953a
Merge branch 'main' into nightly
danielhanchen Jan 16, 2025
dd9b4e1
Torch.Cuda Is Available Condition and Warning (#1545)
aminwhat Jan 16, 2025
bc37b7a
Update mistral.py
danielhanchen Jan 16, 2025
2e7a886
Update mistral.py
danielhanchen Jan 16, 2025
15e6036
Update _utils.py
danielhanchen Jan 16, 2025
0b6bb12
Update _utils.py
danielhanchen Jan 16, 2025
76403f9
Update _utils.py
danielhanchen Jan 16, 2025
3c4ef99
Update _utils.py
danielhanchen Jan 16, 2025
b4c0b02
Update _utils.py
danielhanchen Jan 16, 2025
24a24bf
Fix
danielhanchen Jan 16, 2025
a953bfc
Bug fixes
danielhanchen Jan 16, 2025
e6d677b
Update mapper.py
danielhanchen Jan 19, 2025
d8d8bdc
Add dropout to granite to match HF's implementation (#1557)
Datta0 Jan 19, 2025
aa53ed4
Merge branch 'nightly' of https://github.com/unslothai/unsloth into n…
danielhanchen Jan 19, 2025
f42d0e9
Update llama.py
danielhanchen Jan 19, 2025
a2b55ef
Merge branch 'main' into nightly
danielhanchen Jan 20, 2025
b667bc6
Update llama.py
danielhanchen Jan 20, 2025
1ce40ce
Bug fixes
danielhanchen Jan 20, 2025
cdb3259
fix: flash_attn_detection_error (#1556)
Zzhiter Jan 20, 2025
4 changes: 2 additions & 2 deletions pyproject.toml
@@ -39,7 +39,7 @@ triton = [
"triton @ https://github.com/woct0rdho/triton-windows/releases/download/v3.1.0-windows.post5/triton-3.1.0-cp312-cp312-win_amd64.whl ; python_version=='3.12' and platform_system == 'Windows'",
]
huggingface = [
"unsloth_zoo>=2025.1.2",
"unsloth_zoo>=2025.1.4",
"packaging",
"tyro",
"transformers>=4.46.1,!=4.47.0",
@@ -285,7 +285,7 @@ colab-ampere-torch220 = [
"flash-attn>=2.6.3",
]
colab-new = [
"unsloth_zoo>=2025.1.2",
"unsloth_zoo>=2025.1.4",
"packaging",
"tyro",
"transformers>=4.46.1,!=4.47.0",
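Note: the same 2025.1.4 floor is enforced at import time (see the unsloth/__init__.py hunk below). A minimal standalone sketch of that check, assuming only importlib.metadata and packaging; the upgrade command shown is illustrative:

from importlib.metadata import PackageNotFoundError, version as importlib_version
from packaging.version import Version

MIN_UNSLOTH_ZOO = Version("2025.1.4")  # floor pinned in pyproject.toml above

try:
    installed = Version(importlib_version("unsloth_zoo"))
except PackageNotFoundError:
    raise RuntimeError("unsloth_zoo is not installed")

if installed < MIN_UNSLOTH_ZOO:
    # Mirrors the auto-upgrade path taken in unsloth/__init__.py
    print("Upgrade with: pip install --upgrade --no-cache-dir --no-deps unsloth_zoo")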
6 changes: 5 additions & 1 deletion unsloth/__init__.py
@@ -86,6 +86,10 @@
del os.environ["PYTORCH_CUDA_ALLOC_CONF"]
pass

# First check if CUDA is available ie a NVIDIA GPU is seen
if not torch.cuda.is_available():
raise NotImplementedError("Unsloth: No NVIDIA GPU found? Unsloth currently only supports GPUs!")

# Fix Xformers performance issues since 0.0.25
import importlib.util
from pathlib import Path
@@ -194,7 +198,7 @@ def is_bf16_supported(): return SUPPORTS_BFLOAT16
# Check for unsloth_zoo
try:
unsloth_zoo_version = importlib_version("unsloth_zoo")
if Version(unsloth_zoo_version) < Version("2025.1.2"):
if Version(unsloth_zoo_version) < Version("2025.1.4"):
try:
os.system("pip install --upgrade --no-cache-dir --no-deps unsloth_zoo")
except:
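For reference, the new import-time guard in isolation, as a minimal sketch; the error message is copied from the hunk above, and the motivation (failing fast before any CUDA-dependent setup runs) is an assumption based on #1545:

import torch

# Raise a clear error when no NVIDIA GPU is visible, rather than letting
# later GPU-only code paths fail with harder-to-read errors.
if not torch.cuda.is_available():
    raise NotImplementedError("Unsloth: No NVIDIA GPU found? Unsloth currently only supports GPUs!")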
12 changes: 9 additions & 3 deletions unsloth/models/_utils.py
@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.

__version__ = "2025.1.5"
__version__ = "2025.1.6"

__all__ = [
"SUPPORTS_BFLOAT16",
@@ -285,7 +285,11 @@ def _is_openai_available(): return False
if _is_package_available("flash_attn"):
# Check for CUDA linking errors "undefined symbol: _ZNK3c106SymIntltEl"
try:
from flash_attn.flash_attn_interface import flash_attn_cuda
try:
# See https://github.com/unslothai/unsloth/issues/1437
from flash_attn.flash_attn_interface import flash_attn_gpu
except:
from flash_attn.flash_attn_interface import flash_attn_cuda
HAS_FLASH_ATTENTION = True

# Also check for softcapping
@@ -843,7 +847,9 @@ def patch_linear_scaling(
"self.rotary_emb = .+?\)", function,
flags = re.DOTALL | re.MULTILINE,
)
if len(rotary_emb) == 0: return None, function
if len(rotary_emb) == 0:
return None, exec_code + "\n\n" + function

rotary_emb = rotary_emb[0]
function = function.replace(rotary_emb, fix_rope_function, 1)
function = exec_code + "\n\n" + function
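A condensed sketch of the flash-attn detection changed above: newer flash-attn builds renamed the compiled extension (see unsloth issue #1437), so the new name is tried first with a fallback to the old one, and any failure leaves flash attention disabled:

HAS_FLASH_ATTENTION = False
try:
    try:
        # Newer flash-attn builds expose flash_attn_gpu (issue #1437)
        from flash_attn.flash_attn_interface import flash_attn_gpu
    except ImportError:
        # Older builds still expose flash_attn_cuda
        from flash_attn.flash_attn_interface import flash_attn_cuda
    HAS_FLASH_ATTENTION = True
except Exception:
    # e.g. CUDA linking errors such as "undefined symbol: _ZNK3c106SymIntltEl"
    HAS_FLASH_ATTENTION = False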
7 changes: 4 additions & 3 deletions unsloth/models/granite.py
@@ -89,6 +89,7 @@ def GraniteAttention_fast_forward(
n_groups = self.num_key_value_groups
n_kv_heads = self.config.num_key_value_heads
head_dim = self.head_dim
dropout_p = self.config.attention_dropout if self.training else 0
assert(n_kv_heads * n_groups == n_heads)

Q, K, V = self.apply_qkv(self, hidden_states)
@@ -135,15 +136,15 @@ def GraniteAttention_fast_forward(
Q = Q.view(bsz, q_len, n_kv_heads, n_groups, head_dim)
pass

A = xformers_attention(Q, K, V, attn_bias = causal_mask, scale=self.scaling)
A = xformers_attention(Q, K, V, attn_bias = causal_mask, scale=self.scaling, p=dropout_p)
A = A.view(bsz, q_len, n_heads, head_dim)

elif HAS_FLASH_ATTENTION and attention_mask is None:
Q = Q.transpose(1, 2)
K = K.transpose(1, 2)
V = V.transpose(1, 2)
window = (kv_seq_len, kv_seq_len)
A = flash_attn_func(Q, K, V, causal = True, window_size = window, softmax_scale=self.scaling)
A = flash_attn_func(Q, K, V, causal = True, window_size = window, softmax_scale=self.scaling, dropout_p=dropout_p)
else:
# Grouped query attention
# if n_groups != 1:
@@ -157,7 +158,7 @@
Q, K, V = Q.contiguous(), K.contiguous(), V.contiguous()
# Needs (batch_size, n_heads, seq_len, head_dim)
# is_casual and attention_mask must not be both set!
A = scaled_dot_product_attention(Q, K, V, attn_mask = attention_mask, scale = self.scaling, is_causal = False)
A = scaled_dot_product_attention(Q, K, V, attn_mask = attention_mask, scale = self.scaling, is_causal = False, dropout_p=dropout_p)
# Go back to (batch_size, seq_len, n_heads, head_dim)
A = A.transpose(1, 2).contiguous()
pass
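The dropout plumbing above, condensed into a hedged sketch around torch's SDPA path; the helper name, signature, and tensor shapes are illustrative. The key point is that config.attention_dropout is applied only while training, matching HF's Granite implementation:

import torch.nn.functional as F

def granite_style_attention(Q, K, V, config, training, scaling, attention_mask=None):
    # Dropout is active only in training mode, zero at inference
    dropout_p = config.attention_dropout if training else 0.0
    # Q, K, V: (batch_size, n_heads, seq_len, head_dim)
    return F.scaled_dot_product_attention(
        Q, K, V,
        attn_mask = attention_mask,
        dropout_p = dropout_p,
        scale     = scaling,
        is_causal = False,
    )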
8 changes: 6 additions & 2 deletions unsloth/models/llama.py
@@ -636,6 +636,7 @@ def LlamaModel_fast_forward(
IS_GEMMA2 = self.config.model_type.startswith("gemma2")
IS_COHERE = self.config.model_type.startswith("cohere")
IS_GRANITE = self.config.model_type.startswith("granite")

train_embed_tokens = self.embed_tokens.weight.requires_grad

if IS_GEMMA:
@@ -664,7 +665,7 @@

# Fix up attention mask by setting elements to 0
# Specifically for DPO
if self._has_no_labels and (attention_mask is not None) and (past_key_values is None) and \
if getattr(self, "_has_no_labels", False) is True and (attention_mask is not None) and (past_key_values is None) and \
(not train_embed_tokens):
# Careful for inference the attention_mask is size (1, kv_seq_len)
# Whilst the input_embeds is size (1, 1, 4096)
@@ -792,9 +793,12 @@
pass
pass

if IS_ATTENTION_REFACTOR and not hasattr(self.layers[0].self_attn, "rotary_emb"):
if (IS_ATTENTION_REFACTOR and (hasattr(self, "rotary_emb") or not hasattr(self.layers[0].self_attn, "rotary_emb"))) or IS_GRANITE:
# Transformers main has made it mandatory to pass position_embeddings
# https://github.com/huggingface/transformers/pull/34858
# Also, transformers 4.45.0 supports granite but with the attention refactor (it always had the refactor)
# unsloth's check for granite too has "version >= 4.45.0 (rightly so)".
# so let granite always use the attention refactor implementation.
position_embeddings = self.rotary_emb(hidden_states, position_ids, self.config.max_position_embeddings)
else:
position_embeddings = None
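Since the transformers attention refactor (huggingface/transformers#34858), the per-layer rotary module may no longer exist, so the model computes the (cos, sin) pair once and hands it to every decoder layer. A hedged sketch of the standard rotary application that this tuple feeds, not unsloth's exact kernel; the shapes are assumptions:

import torch

def rotate_half(x):
    x1, x2 = x.chunk(2, dim = -1)
    return torch.cat((-x2, x1), dim = -1)

def apply_rotary_pos_emb(q, k, cos, sin):
    # q, k: (batch, n_heads, seq_len, head_dim); cos, sin: (batch, seq_len, head_dim)
    cos = cos.unsqueeze(1)  # broadcast across the heads dimension
    sin = sin.unsqueeze(1)
    q_embed = (q * cos) + (rotate_half(q) * sin)
    k_embed = (k * cos) + (rotate_half(k) * sin)
    return q_embed, k_embed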
6 changes: 2 additions & 4 deletions unsloth/models/mapper.py
@@ -471,20 +471,18 @@
"meta-llama/Llama-3.2-11B-Vision-Instruct",
"unsloth/Llama-3.2-11B-Vision-Instruct-bnb-4bit",
),
"unsloth/Llama-3.2-90B-Vision-Instruct-unsloth-bnb-4bit" : (
"unsloth/Llama-3.2-90B-Vision-Instruct-bnb-4bit" : (
"unsloth/Llama-3.2-90B-Vision-Instruct",
"meta-llama/Llama-3.2-90B-Vision-Instruct",
"unsloth/Llama-3.2-90B-Vision-Instruct-bnb-4bit",
),
"unsloth/Llama-3.2-11B-Vision-unsloth-bnb-4bit" : (
"unsloth/Llama-3.2-11B-Vision",
"meta-llama/Llama-3.2-11B-Vision",
"unsloth/Llama-3.2-11B-Vision-bnb-4bit",
),
"unsloth/Llama-3.2-90B-Vision-unsloth-bnb-4bit" : (
"unsloth/Llama-3.2-90B-Vision-bnb-4bit" : (
"unsloth/Llama-3.2-90B-Vision",
"meta-llama/Llama-3.2-90B-Vision",
"unsloth/Llama-3.2-90B-Vision-bnb-4bit",
),
"unsloth/Pixtral-12B-2409-unsloth-bnb-4bit" : (
"unsloth/Pixtral-12B-2409",
2 changes: 1 addition & 1 deletion unsloth/models/mistral.py
@@ -304,7 +304,7 @@ def pre_patch():
attention_module = MistralAttention,
)
# Just for Mistral Nemo models!
if function is not None:
if function is not None and init_name is not None:
function = patch_mistral_nemo_attention(function)
# if True:#init_name is not None:
exec(function, globals())