@@ -371,12 +371,16 @@ def reshard_and_split_qkv(
     )
     # We should now have shape [TP_SIZE, (hidden_size + 2 * kv_hidden_size) / TP_SIZE, hidden_size].
     # At this point, for each TP rank, q, k, and v are concatenated
-
+
     # Next, we split tp_sharded_qkv into q, k, v along dim 1
-    hidden_size_per_attention_head = hf_config.hidden_size // hf_config.num_attention_heads
-    kv_hidden_size = int(hidden_size_per_attention_head * hf_config.num_key_value_heads)
+    hidden_size_per_attention_head = (
+        hf_config.hidden_size // hf_config.num_attention_heads
+    )
+    kv_hidden_size = int(
+        hidden_size_per_attention_head * hf_config.num_key_value_heads
+    )
     tensor_parallel_size = len(loaded_tp_ranks)
-
+
     q, k, v = torch.split(
         tp_sharded_qkv,
         [
@@ -385,13 +389,17 @@ def reshard_and_split_qkv(
             kv_hidden_size // tensor_parallel_size,
         ],
         dim=1,
-    ) # New shapes:
+    )  # New shapes:
     # q-->[TP_SIZE, hidden_size/TP_SIZE, hidden_size]
     # k-->[TP_SIZE, kv_hidden_size/TP_SIZE, hidden_size]
     # v-->[TP_SIZE, kv_hidden_size/TP_SIZE, hidden_size]

     # Finally, we flatten the first two dimensions merging the TP partitions
-    q, k, v = q.reshape(-1, q.shape[2]), k.reshape(-1, k.shape[2]), v.reshape(-1, k.shape[2])
+    q, k, v = (
+        q.reshape(-1, q.shape[2]),
+        k.reshape(-1, k.shape[2]),
+        v.reshape(-1, k.shape[2]),
+    )

     # return these
     state_dict = {}
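
To make the shapes concrete, here is a minimal standalone sketch of the same split-then-merge on toy dimensions. It is not part of the commit: the SimpleNamespace config, TP_SIZE, and the randomly initialized tp_sharded_qkv are stand-ins for the real hf_config and the loaded tensor-parallel shards.

import torch
from types import SimpleNamespace

# Stand-in config and TP degree (toy values, not from the repository)
hf_config = SimpleNamespace(hidden_size=8, num_attention_heads=4, num_key_value_heads=2)
TP_SIZE = 2

hidden_size_per_attention_head = hf_config.hidden_size // hf_config.num_attention_heads  # 2
kv_hidden_size = hidden_size_per_attention_head * hf_config.num_key_value_heads          # 4

# Fake fused QKV shards stacked along a leading TP dimension:
# [TP_SIZE, (hidden_size + 2 * kv_hidden_size) / TP_SIZE, hidden_size]
tp_sharded_qkv = torch.randn(
    TP_SIZE,
    (hf_config.hidden_size + 2 * kv_hidden_size) // TP_SIZE,
    hf_config.hidden_size,
)

# Split each rank's shard into its q, k, v rows along dim 1
q, k, v = torch.split(
    tp_sharded_qkv,
    [
        hf_config.hidden_size // TP_SIZE,
        kv_hidden_size // TP_SIZE,
        kv_hidden_size // TP_SIZE,
    ],
    dim=1,
)

# Flatten the leading TP dimension to merge the partitions
q, k, v = q.reshape(-1, q.shape[2]), k.reshape(-1, k.shape[2]), v.reshape(-1, v.shape[2])
print(q.shape, k.shape, v.shape)  # [8, 8], [4, 8], [4, 8]

The split sizes along dim 1 must sum to the per-rank fused dimension, which is why hidden_size and kv_hidden_size are each divided by the TP degree before the split.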