 from scratchpad.scheduler.schedule_batch import global_args
 from scratchpad.model_executor.forward_info import ForwardBatch
 from triteia.python.nn.linear import sparse_low_precision_linear
+from triteia.python.ops.matmul.sbmm import sbmm_4bit_2_4_native, sbmm_4bit_2_4_multilaunch, sbmm_4bit_2_4_forloop
 
 class LlamaMLP(nn.Module):
     def __init__(
@@ -65,37 +66,72 @@ def forward(self, x):
         x, _ = self.down_proj(x)
         return x
 
+class LLamaSBmm(nn.Module):
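+    """Batched per-expert linear layer backed by triteia's 4-bit 2:4-sparse sbmm kernels.
+
+    One set of packed weights is stored per expert:
+      qweight -- int32-packed 4-bit quantized weights
+      meta    -- metadata for the 2:4 structured-sparsity pattern
+      scales  -- per-group dequantization scales (group size defaults to infeatures)
+    `sbmm_type` selects which of the imported triteia sbmm variants is called.
+    """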
+    def __init__(self, num_experts, infeatures, outfeatures, sbmm_type="naive", groupsize=-1):
+        super().__init__()
+        if groupsize == -1:
+            groupsize = infeatures
+        self.infeatures = infeatures
+        self.outfeatures = outfeatures
+        self.groupsize = groupsize
+        self.qweight = nn.Parameter(torch.empty((num_experts, self.infeatures // 32, self.outfeatures * 16 // 8), dtype=torch.int32), requires_grad=False)
+        self.meta = nn.Parameter(torch.empty((num_experts, self.outfeatures, self.infeatures // 16), dtype=torch.int16), requires_grad=False)
+        self.scales = nn.Parameter(torch.empty((num_experts, self.infeatures // groupsize, self.outfeatures), dtype=torch.float16), requires_grad=False)
+        self.workspace = nn.Parameter(torch.zeros(num_experts, self.outfeatures // 128 * 16, dtype=torch.int32), requires_grad=False)
+        if sbmm_type == "naive":
+            self.sbmm_func = sbmm_4bit_2_4_native
+        elif sbmm_type == "multilaunch":
+            self.sbmm_func = sbmm_4bit_2_4_multilaunch
+        elif sbmm_type == "forloop":
+            self.sbmm_func = sbmm_4bit_2_4_forloop
+        else:
+            raise NotImplementedError(f"unknown sbmm_type: {sbmm_type}")
+
+    def forward(self, x, indices):
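+        # `indices` carries one expert id per row of `x`; the sbmm op applies that
+        # expert's packed weights to the corresponding row.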
+        return self.sbmm_func(
+            qweights=self.qweight.data,
+            xs=x,
+            metas=self.meta.data,
+            ss=self.scales.data,
+            indices=indices,
+        )
+
 
 class LlamaCompressedMLP(nn.Module):
     def __init__(
         self,
         hidden_size: int,
         intermediate_size: int,
         hidden_act: str,
+        num_experts: int,
+        sbmm_type: str,
         quant_config: Optional[QuantizationConfig] = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
         self.intermediate_size = intermediate_size
         self.hidden_size = hidden_size
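+        # A single LLamaSBmm holds every expert's gate/up and down weights; the
+        # active expert is chosen per row at forward time via `indices`.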
-        self.gate_up_proj = sparse_low_precision_linear(
-            hidden_size,
-            intermediate_size * 2,
+        self.gate_up_proj = LLamaSBmm(
+            num_experts=num_experts,
+            infeatures=hidden_size,
+            outfeatures=intermediate_size * 2,
+            sbmm_type=sbmm_type,
         )
-        self.down_proj = sparse_low_precision_linear(
-            intermediate_size,
-            hidden_size,
+        self.down_proj = LLamaSBmm(
+            num_experts=num_experts,
+            infeatures=intermediate_size,
+            outfeatures=hidden_size,
+            sbmm_type=sbmm_type,
         )
 
-    def forward(self, x):
-        assert not x.isnan().any()
-        gate_up = self.gate_up_proj(x)
-        assert not gate_up.isnan().any()
-        d = x.shape[-1] // 2
-        x = F.silu(x[..., :d]) * x[..., d:]
-        assert not x.isnan().any()
-        x = self.down_proj(x)
-        assert not x.isnan().any()
+    def forward(self, x, indices):
+        gate_up = self.gate_up_proj(x, indices)
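+        # SwiGLU: gate_up stacks [gate, up]; gate with SiLU, then multiply elementwise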
+        d = gate_up.shape[-1] // 2
+        x = F.silu(gate_up[..., :d]) * gate_up[..., d:]
+        x = self.down_proj(x, indices)
         return x
 
 class LlamaMoE(nn.Module):
@@ -106,6 +142,7 @@ def __init__(
         hidden_act: str,
         num_experts: int,
         experts_per_token: int,
+        sbmm_type: str,
         quant_config: Optional[QuantizationConfig] = None,
         prefix: str = "",
     ) -> None:
@@ -119,49 +156,42 @@ def __init__(
             quant_config=quant_config,
             prefix=f"{prefix}.mlp.EXPERT_ID",
         )
-        self.mlp = nn.ModuleList([
-            LlamaCompressedMLP(
+
+        self.mlp = LlamaCompressedMLP(
+            num_experts=num_experts,
             hidden_size=hidden_size,
             intermediate_size=intermediate_size,
             hidden_act=hidden_act,
             quant_config=quant_config,
-            prefix=f"{prefix}.mlp.{i}"
-            ) for i in range(num_experts)
-        ])
+            sbmm_type=sbmm_type,
+            prefix=f"{prefix}.mlp."
+        )
         self.gate = nn.Linear(hidden_size, num_experts, bias=False)
 
     def forward(self, x):
-
         base_y = self.base_mlp(x)
         original_shape = x.shape
         x = x.view(1, *x.shape) if x.dim() == 2 else x
         batch_size, sequence_length, hidden_dim = x.shape
+
         x = x.view(-1, hidden_dim)
         router_logits = self.gate(x)
-
         routing_weights = F.softmax(router_logits, dim=1, dtype=torch.float)
         routing_weights, selected_experts = torch.topk(routing_weights, self.experts_per_token, dim=-1)
         routing_weights /= routing_weights.sum(dim=-1, keepdim=True)
-        # we cast back to the input dtype
-        routing_weights = routing_weights.to(x.dtype)
-        assert not routing_weights.isnan().any(), "routing weights have nan"
+        routing_weights = routing_weights.to(x.dtype).T
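+        # routing_weights is now (experts_per_token, tokens): row k holds each token's
+        # weight for its k-th selected expert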
         final_hidden_states = torch.zeros(
             (batch_size * sequence_length, hidden_dim), dtype=x.dtype, device=x.device
         )
-        expert_mask = torch.nn.functional.one_hot(selected_experts, num_classes=self.num_experts).permute(2, 1, 0).contiguous()
-
-        for expert_idx in range(self.num_experts):
-            expert_layer = self.mlp[expert_idx]
-            current_mask = expert_mask
-            current_mask = current_mask[expert_idx]
-            idx, top_x = torch.where(current_mask)
-            current_state = x[None, top_x].reshape(-1, hidden_dim)
-            assert not torch.isnan(current_state).any(), "current input state has nan"
-            current_hidden_states = expert_layer(current_state)
-            assert not torch.isnan(current_hidden_states).any(), "current hidden state has nan"
-            current_hidden_states *= routing_weights[top_x, idx, None]
-            if current_hidden_states.nelement() != 0:
-                final_hidden_states.index_add_(0, top_x, current_hidden_states.to(x.dtype))
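+        # Sorted dispatch: for each top-k slot, sort tokens by their selected expert so
+        # that each expert's rows are contiguous, run the sbmm MLP once over the whole
+        # batch, then scatter the outputs back to the original token order.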
+        sort_selected_experts, argsort_selected_experts = torch.sort(selected_experts.T, dim=-1)
+        for k in range(self.experts_per_token):
+            current_selected_experts = sort_selected_experts[k]
+            current_routing_weights = routing_weights[k].view(-1, 1)
+            current_argsort_selected_experts = argsort_selected_experts[k]
+            # inverse_sort[t] is the position of token t in the sorted batch, so
+            # indexing with it restores the original token order
+            inverse_sort = torch.argsort(current_argsort_selected_experts)
+            sort_x = x[current_argsort_selected_experts]
+            current_hidden_states = self.mlp(sort_x, current_selected_experts)[inverse_sort] * current_routing_weights
+            final_hidden_states += current_hidden_states
+
         final_hidden_states = final_hidden_states.reshape(batch_size, sequence_length, hidden_dim)
         final_hidden_states = final_hidden_states.view(original_shape)
 
@@ -310,6 +340,7 @@ def __init__(
             quant_config=quant_config,
             num_experts=config.num_experts,
             experts_per_token=config.experts_per_token,
+            sbmm_type=config.sbmm_type,
             prefix=f"{prefix}.moe",
         )
         self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
@@ -480,7 +511,6 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
480511 ("mlp.EXPERT_ID" , "base_mlp" )
481512 ]
482513 for name , loaded_weight in weights :
483- # print(name)
484514 assert not loaded_weight .isnan ().any ()
485515 # continue
486516 if "rotary_emb.inv_freq" in name or "projector" in name :