
Commit 6b372ef

Fix/lora delta serving (#19)
* wip: fix output correctness
* fix output correctness
* Remove debug print statement in toppings_manager.py
1 parent 0e18096 commit 6b372ef

File tree: 9 files changed (+95, -45 lines)

scratchpad/managers/toppings_manager.py

Lines changed: 61 additions & 7 deletions
@@ -124,7 +124,6 @@ def init_toppings(self):
         self.origin_target_modules = set()
         for name, top in self.available_toppings.items():
             self.configs[name] = ToppingConfig(topping_type=top[0], path=top[1])
-
             self.origin_target_modules = set(self.origin_target_modules) | set(
                 self.configs[name].hf_config["target_modules"]
             )
@@ -134,6 +133,8 @@ def init_toppings(self):
                 self.base_model.get_module_name(module)
                 for module in self.origin_target_modules
             }
+            # remove down_proj from target modules
+            logger.info(f"Target modules: {self.target_modules}")
         else:
             logger.warning(
                 "WARNING: get_module_name() is not defined, "
@@ -166,6 +167,7 @@ def init_toppings(self):
         self.lora_id = {}
         self.deltas: List[DeltaAdapter] = []
         self.delta_id = {}
+
         for name in self.available_toppings.keys():
             t_type = self.available_toppings[name][0]
             logger.info(f"Loading {t_type} {name}")
@@ -189,10 +191,15 @@ def init_toppings(self):
             self.deltas[-1].initialize_weights()

         # misc lora configs
-        self.max_lora_dim = max(
-            [x.hf_config["r"] for x in self.configs.values() if "r" in x.hf_config]
-        )
-        self.scaling = self.loras[0].scaling
+        self.max_lora_dim = [
+            x.hf_config["r"] for x in self.configs.values() if "r" in x.hf_config
+        ]
+        if len(self.max_lora_dim) == 0:
+            self.max_lora_dim = 0
+            self.scaling = 0
+        else:
+            self.max_lora_dim = max(self.max_lora_dim)
+            self.scaling = self.loras[0].scaling
         # FIXME remove the restrictions
         assert all(
             x.hf_config["r"] == self.max_lora_dim
@@ -215,6 +222,9 @@ def print_available_toppings(self):
     def set_topping_module(self, module_name, module):
         topping_module = get_topping_layer(module)
         replace_submodule(self.base_model, module_name, topping_module)
+        logger.info(
+            f"Replaced {module_name} with topping module {type(topping_module)}"
+        )
         return topping_module

     def prepare_topping_batch(self, forward_batch: ForwardBatch):
@@ -288,7 +298,6 @@ def prepare_topping_batch(self, forward_batch: ForwardBatch):
             dtype=torch.int64,
             device=forward_batch.input_ids.device,
         )
-        print(f"weight_indices: {weight_indices}")
         for module_name, module in self.topping_modules:
             layer_id = get_layer_id(module_name)
             if "lm_head" in module_name:
@@ -327,6 +336,42 @@ def prepare_topping_batch(self, forward_batch: ForwardBatch):
                         self.scales_buffer["kv_proj"][layer_id][:len_deltas],
                     ),
                 )
+            elif "down_proj" in module_name:
+                weight_name = self.get_weight_name(module_name, 0)
+                module.set_topping_info(
+                    bs,
+                    weight_indices,
+                    lora_buffer=(
+                        (
+                            self.A_buffer[weight_name][layer_id][:len_loras]
+                            if weight_name in self.A_buffer
+                            else None
+                        ),
+                        (
+                            self.B_buffer[weight_name][layer_id][:len_loras]
+                            if weight_name in self.B_buffer
+                            else None
+                        ),
+                    ),
+                    delta_buffer=(
+                        (
+                            self.qweight_buffer[weight_name][layer_id][:len_deltas]
+                            if weight_name in self.qweight_buffer
+                            else None
+                        ),
+                        (
+                            self.meta_buffer[weight_name][layer_id][:len_deltas]
+                            if weight_name in self.meta_buffer
+                            else None
+                        ),
+                        (
+                            self.scales_buffer[weight_name][layer_id][:len_deltas]
+                            if weight_name in self.scales_buffer
+                            else None
+                        ),
+                    ),
+                    debug=False,
+                )
             else:
                 weight_name = self.get_weight_name(module_name, 0)
                 module.set_topping_info(
@@ -375,6 +420,7 @@ def load_topping(self, uid, buffer_id):
         """
         This function loads topping from CPU -> GPU memory
         """
+
         if uid not in self.available_toppings:
             logger.error(f"Topping {uid} not registered")
             return
@@ -420,6 +466,7 @@ def _load_delta(self, uid, buffer_id):

         for i in range(num_layer):
             layer_weights = self.deltas[self.delta_id[uid]].layers[i].weights
+            # load to buffer space
             for name, weights in layer_weights.items():
                 if (
                     "qkv_proj" in name
@@ -445,7 +492,7 @@ def _load_delta(self, uid, buffer_id):
                         self.scales_buffer[kv_proj_name][i][buffer_id].copy_(
                             weights[:, q_dim:]
                         )
-                    else:
+                    elif "meta" in name:
                         q_proj_name = "q_proj"
                         kv_proj_name = "kv_proj"
                         q_dim = self.meta_buffer[q_proj_name][i][buffer_id].shape[0]
@@ -455,23 +502,30 @@ def _load_delta(self, uid, buffer_id):
                         self.meta_buffer[kv_proj_name][i][buffer_id].copy_(
                             weights[q_dim:, :]
                         )
+                    else:
+                        print("Unknown delta weight name: {name}")
                 else:
                     if "qweight" in name:
                         weight_name = self.get_delta_weight_name(name)
                         if weight_name:
                             self.qweight_buffer[weight_name][i][buffer_id].copy_(
                                 weights
                             )
+                        else:
+                            print("Unknown delta weight name: {name}")
+
                     elif "scales" in name:
                         weight_name = self.get_delta_weight_name(name)
                         if weight_name:
                             self.scales_buffer[weight_name][i][buffer_id].copy_(weights)
+
                     elif "meta" in name:
                         weight_name = self.get_delta_weight_name(name)
                         if weight_name:
                             self.meta_buffer[weight_name][i][buffer_id].copy_(weights)
                     else:
                         print("Unknown delta weight name: {name}")
+                        raise ValueError(f"Unknown delta weight name: {name}")

         for name, outside_module in self.deltas[
             self.delta_id[uid]
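Note on the max_lora_dim change above: when only delta toppings are registered, none of the topping configs carry an "r" entry, so the previous max(...) over an empty list raised a ValueError at startup. A minimal standalone sketch of the same guard pattern, using hypothetical config dictionaries rather than the real ToppingConfig objects:

# Hypothetical stand-ins for ToppingConfig.hf_config dictionaries.
lora_configs = [{"r": 16, "scaling": 2.0}]       # LoRA toppings carry an "r" rank
delta_only_configs = [{"bits": 4}, {"bits": 4}]  # delta toppings carry no "r" key


def resolve_max_lora_dim(configs):
    ranks = [c["r"] for c in configs if "r" in c]
    if not ranks:
        # Delta-only serving: no LoRA ranks registered, avoid max([]) -> ValueError.
        return 0, 0.0
    return max(ranks), configs[0].get("scaling", 1.0)


print(resolve_max_lora_dim(lora_configs))        # (16, 2.0)
print(resolve_max_lora_dim(delta_only_configs))  # (0, 0.0) instead of a ValueError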

scratchpad/memory/topping_pool.py

Lines changed: 6 additions & 6 deletions
@@ -141,32 +141,32 @@ def __init__(
         stacked_dim = dimensions[1] * stack_factor

         self.qweight_buffer[module] = [
-            torch.zeros(
+            torch.empty(
                 self.max_toppings_per_batch,
                 dimensions[0] // (pack_factor * sparse_factor * 2),
                 stacked_dim * 2,
                 dtype=delta_dtypes["qweight"],
                 device="cuda",
             )
-            for _ in range(num_layers)
+            for i in range(num_layers)
         ]
         self.meta_buffer[module] = [
-            torch.zeros(
+            torch.empty(
                 self.max_toppings_per_batch,
                 stacked_dim,
                 dimensions[0] // (pack_factor * sparse_factor),
                 dtype=delta_dtypes["meta"],
                 device="cuda",
             )
-            for _ in range(num_layers)
+            for i in range(num_layers)
         ]
         self.scales_buffer[module] = [
-            torch.zeros(
+            torch.empty(
                 self.max_toppings_per_batch,
                 1,
                 stacked_dim,
                 dtype=delta_dtypes["scales"],
                 device="cuda",
             )
-            for _ in range(num_layers)
+            for i in range(num_layers)
         ]
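Note on the torch.zeros -> torch.empty change: these pool buffers are always fully overwritten by copy_() when a delta is loaded in _load_delta, so zero-filling them at allocation time is wasted work; torch.empty only reserves memory. A small CPU-only sketch of the pattern, with made-up shapes:

import torch

# Made-up sizes; the real pool derives them from pack/sparse factors and stack_factor.
max_toppings_per_batch, rows, cols, num_layers = 2, 128, 256, 4

# torch.empty allocates without initializing, so the initial contents are arbitrary.
qweight_buffer = [
    torch.empty(max_toppings_per_batch, rows, cols, dtype=torch.int32)
    for _ in range(num_layers)
]

# Loading a topping overwrites its slot completely, so nothing ever reads the
# uninitialized values.
new_weights = torch.randint(0, 2**31 - 1, (rows, cols), dtype=torch.int32)
qweight_buffer[0][1].copy_(new_weights)
assert torch.equal(qweight_buffer[0][1], new_weights)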

scratchpad/model_executor/forward_info.py

Lines changed: 0 additions & 1 deletion
@@ -212,5 +212,4 @@ def init_new(
         # Init lora information
         if model_runner.server_args.enable_toppings:
             model_runner.topping_manager.prepare_topping_batch(ret)
-
         return ret

scratchpad/nn/models/llama.py

Lines changed: 2 additions & 2 deletions
@@ -302,11 +302,11 @@ def forward(
     def get_hidden_dim(self, module_name):
         if module_name in ["q_proj", "o_proj", "qkv_proj"]:
             return self.config.hidden_size, self.config.hidden_size
-        elif module_name in ["kv_proj"]:
+        elif module_name in ["kv_proj", "k_proj", "v_proj"]:
             return self.config.hidden_size, self.config.hidden_size // (
                 self.config.num_attention_heads // self.config.num_key_value_heads
             )
-        elif module_name == "gate_up_proj":
+        elif module_name in ["gate_up_proj", "up_proj", "gate_proj"]:
             return self.config.hidden_size, self.config.intermediate_size
         elif module_name == "down_proj":
             return self.config.intermediate_size, self.config.hidden_size
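Note on get_hidden_dim: for the kv_proj/k_proj/v_proj branch, the output width is the hidden size divided by the GQA group size (attention heads per KV head). A worked example with assumed Llama-3.2-1B-style values (hidden_size 2048, 32 attention heads, 8 KV heads, intermediate_size 8192; the real values come from the model config). The function below mirrors the branch structure in the diff:

# Assumed config values, for illustration only.
hidden_size = 2048
num_attention_heads = 32
num_key_value_heads = 8
intermediate_size = 8192


def get_hidden_dim(module_name):
    # Mirrors the branch structure of LlamaForCausalLM.get_hidden_dim above.
    if module_name in ["q_proj", "o_proj", "qkv_proj"]:
        return hidden_size, hidden_size
    elif module_name in ["kv_proj", "k_proj", "v_proj"]:
        return hidden_size, hidden_size // (num_attention_heads // num_key_value_heads)
    elif module_name in ["gate_up_proj", "up_proj", "gate_proj"]:
        return hidden_size, intermediate_size
    elif module_name == "down_proj":
        return intermediate_size, hidden_size


print(get_hidden_dim("k_proj"))   # (2048, 512): 2048 // (32 // 8)
print(get_hidden_dim("up_proj"))  # (2048, 8192)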

scratchpad/nn/toppings/topping_layer.py

Lines changed: 12 additions & 26 deletions
@@ -2,6 +2,7 @@

 import torch
 from torch import nn
+from torch.nn import functional as F
 from typing import Union
 from scratchpad.nn.layers.vocab_parallel_embedding import (
     ParallelLMHead,
@@ -137,6 +138,7 @@ def forward(self, input_: torch.Tensor):
         qweight_dim = self.qweight_buffer.shape[2] // 2
         metas_dim = self.metas_buffer.shape[1] // 2
         scales_dim = self.scales_buffer.shape[2] // 2
+
         for i in range(2):
             output = ldmm(
                 indices=self.weight_indices,
@@ -198,18 +200,6 @@ def set_topping_info(
             self.meta_buffer_kv = torch.zeros(0, 0, 0)
             self.scales_buffer_kv = torch.zeros(0, 0, 0)

-        # q,k,v have the same input dimensions
-        # k,v have the same output dimensions
-        # q has a different output dimension than k,v
-
-        # (A_buffer_qkv: bsz, dim1, rank*2)
-        # (B_buffer_q: bsz, rank, dim2*2)
-        # (B_buffer_kv: bsz, rank, dim3*2)
-
-        # (qweight_buffer: bsz,_, _*3)
-        # (meta_buffer: bsz,_, _*3)
-        # (scales_buffer: bsz, _, _*3)
-
     def forward(self, input_: torch.Tensor):
         base_output = self.base_layer(input_)[0]
         rank = self.A_buffer_qkv.shape[2] // 3
@@ -248,25 +238,27 @@ def forward(self, input_: torch.Tensor):
                 ],
             )
             base_output[:, i * b_dim_kv : (i + 1) * b_dim_kv] += output
-
         return base_output, None


 class RowParallelLinearWithTopping(BaseLayerWithTopping):
     def __init__(self, base_layer: RowParallelLinear, config: Dict) -> None:
         super().__init__(base_layer, config)

-    def set_topping_info(self, bs, weight_indices, lora_buffer=None, delta_buffer=None):
+    def set_topping_info(
+        self, bs, weight_indices, lora_buffer=None, delta_buffer=None, debug=False
+    ):
         self.weight_indices = weight_indices
         self.bs = bs
-        if lora_buffer != None:
+        self.debug = debug
+        if lora_buffer is not None:
             self.A_buffer = lora_buffer[0]
             self.B_buffer = lora_buffer[1]
         else:
             self.A_buffer = torch.zeros(0, 0, 0)
             self.B_buffer = torch.zeros(0, 0, 0)

-        if delta_buffer != None:
+        if delta_buffer is not None:
             self.qweight_buffer = delta_buffer[0]
             self.metas_buffer = delta_buffer[1]
             self.scales_buffer = delta_buffer[2]
@@ -276,7 +268,7 @@ def set_topping_info(self, bs, weight_indices, lora_buffer=None, delta_buffer=No
             self.scales_buffer = torch.zeros(0, 0, 0)

     def forward(self, input_: torch.Tensor):
-        base_output = torch.matmul(input_, self.base_layer.weight.T)
+        base_output = F.linear(input_, self.base_layer.weight, self.base_layer.bias)
         delta_output = ldmm(
             indices=self.weight_indices,
             x=input_,
@@ -285,16 +277,10 @@ def forward(self, input_: torch.Tensor):
             DeltaW=self.qweight_buffer,
             metas=self.metas_buffer,
             ss=self.scales_buffer,
+            debug=self.debug,
         )
-        print(f"weight_indices: {self.weight_indices}")
-        print(f"A_buffer.shape: {self.A_buffer.shape}")
-        print(f"base_output.shape: {base_output.shape}")
-        print(f"delta_output.shape: {delta_output.shape}")
-        print(f"base: {base_output}")
-        print(f"max delta: {torch.max(abs(delta_output))}")
-        assert base_output.shape == delta_output.shape
+        # assert base_output.shape == delta_output.shape
         output_ = base_output + delta_output
-        # output_ = base_output
         if not self.base_layer.skip_bias_add:
             output = (
                 output_ + self.base_layer.bias
@@ -364,7 +350,7 @@ def _get_logits(
         assert len(unique_indices) == 1, f"Prefill stage only supports one index"
         w_idx = unique_indices[0]
         if w_idx == -1:
-            w = weight.T
+            w = weight
         else:
             w = self.delta_buffer[w_idx]
         output = nn.functional.linear(last_hidden, w)
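Note on the F.linear changes in this file: torch.nn.functional.linear expects a weight of shape (out_features, in_features), exactly how nn.Linear stores it, and computes x @ W.T + b internally. That is why RowParallelLinearWithTopping.forward no longer transposes the weight by hand and why the lm_head path now passes weight instead of weight.T. A quick equivalence check:

import torch
import torch.nn.functional as F

torch.manual_seed(0)
x = torch.randn(4, 16)       # (batch, in_features)
weight = torch.randn(8, 16)  # (out_features, in_features), the layout nn.Linear stores
bias = torch.randn(8)

manual = torch.matmul(x, weight.T) + bias
fused = F.linear(x, weight, bias)  # same math; the transpose happens inside F.linear
assert torch.allclose(manual, fused, atol=1e-5)

# For the lm_head path: F.linear already applies the transpose, so pre-transposing a
# (vocab, hidden) weight would undo it and break the output shape.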

scratchpad/nn/toppings/topping_module.py

Lines changed: 1 addition & 0 deletions
@@ -96,6 +96,7 @@ def initialize_weights(self):
             delta_config = json.load(f)
         self.pack_factor = 32 // delta_config["compress_config"]["bits"]
         self.sparse_factor = int(1 / delta_config["compress_config"]["sparsity"])
+
         weight_path = os.path.join(local_path, "deltazip-compressed.safetensors")
         with st.safe_open(weight_path, framework="torch", device="cpu") as f:
             keys = f.keys()
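Note on the two factors above, read from the delta's compress_config: pack_factor is how many quantized values fit in one 32-bit word, and sparse_factor is the inverse of the kept-weight fraction. With the 4-bit, 2:4-sparse artifacts used in the serve scripts (the 4b_2n4m_128bs checkpoints), the assumed arithmetic is:

# Assumed compress_config for a 4-bit, 2:4-sparse deltazip artifact (values are
# illustrative; the real ones are read from the checkpoint's config file).
compress_config = {"bits": 4, "sparsity": 0.5}

pack_factor = 32 // compress_config["bits"]           # 8 four-bit values per 32-bit word
sparse_factor = int(1 / compress_config["sparsity"])  # 2: half of the values are kept

print(pack_factor, sparse_factor)  # 8 2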
Lines changed: 8 additions & 2 deletions
@@ -1,4 +1,10 @@
 export PROMETHEUS_MULTIPROC_DIR=.local
 sp serve meta-llama/Llama-3.2-1B-Instruct --host 0.0.0.0 --port 8080 \
-    --enable-system-controller --use-heterogeneous-pool \
-    --init-toppings lora:ketchup123/llama-3.2-1B-instruct-gsm8k:ketchup123/llama-3.2-1B-instruct-gsm8k
+    --enable-system-controller \
+    --use-heterogeneous-pool \
+    --enable-toppings \
+    --init-toppings lora:ketchup123/llama-3.2-1B-instruct-gsm8k:ketchup123/llama-3.2-1B-instruct-gsm8k,delta:deltazip/meta-llama.Llama-3.2-1B-Instruct.4b_2n4m_128bs:deltazip/meta-llama.Llama-3.2-1B-Instruct.4b_2n4m_128bs-1,delta:deltazip/meta-llama.Llama-3.2-1B-Instruct.4b_2n4m_128bs:deltazip/meta-llama.Llama-3.2-1B-Instruct.4b_2n4m_128bs-2 \
+    --attention-backend triton \
+    --sampling-backend pytorch \
+    --max-toppings-per-batch 2 \
+    --disable-cuda-graph

scripts/serve_llama_1b_with_toppings_torch.sh

Lines changed: 3 additions & 1 deletion
@@ -1,7 +1,9 @@
 export PROMETHEUS_MULTIPROC_DIR=.local
-sp serve meta-llama/Llama-3.2-3B-Instruct --host 0.0.0.0 --port 8080 \
+sp serve meta-llama/Llama-3.2-3B --host 0.0.0.0 --port 8080 \
     --enable-system-controller \
+    --tokenizer-path meta-llama/Llama-3.2-3B-Instruct \
     --use-heterogeneous-pool \
+    --enable-toppings \
     --init-toppings lora:eltorio/Llama-3.2-3B-appreciation:eltorio/Llama-3.2-3B-appreciation-1,lora:eltorio/Llama-3.2-3B-appreciation:eltorio/Llama-3.2-3B-appreciation-2,delta:deltazip/meta-llama.Llama-3.2-3B-Instruct.4b_2n4m_128bs:deltazip/meta-llama.Llama-3.2-3B-Instruct.4b_2n4m_128bs-1,delta:deltazip/meta-llama.Llama-3.2-3B-Instruct.4b_2n4m_128bs:deltazip/meta-llama.Llama-3.2-3B-Instruct.4b_2n4m_128bs-2 \
     --attention-backend triton \
     --sampling-backend pytorch \

tools/utils/test_concurrency.py

Lines changed: 2 additions & 0 deletions
@@ -15,6 +15,7 @@ def main(args):
         # "eltorio/Llama-3.2-3B-appreciation-2",
         "deltazip/meta-llama.Llama-3.2-3B-Instruct.4b_2n4m_128bs-1",
         "deltazip/meta-llama.Llama-3.2-3B-Instruct.4b_2n4m_128bs-2",
+        # "meta-llama/Llama-3.2-3B"
     ]
     prompts = np.random.choice(prompts, args.num_req, replace=True)
     models = np.random.choice(models, args.num_req, replace=True)
@@ -29,6 +30,7 @@ def main(args):
     ]
     responses = asyncio.run(make_requests(args.endpoint, reqs))
     for resp in responses:
+        print(f"---")
         print(resp["choices"][0]["message"]["content"])

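For reference, a hedged sketch of how a single request against one of the served toppings could look. The endpoint path and payload shape are assumptions inferred from the resp["choices"][0]["message"]["content"] access above (an OpenAI-style chat completions response); they are not confirmed by this diff, so adjust to the actual server API:

import requests

# Assumed OpenAI-compatible endpoint on the port used by the serve scripts;
# tools/utils/test_concurrency.py takes the endpoint via --endpoint.
url = "http://localhost:8080/v1/chat/completions"
payload = {
    # Select a topping by the name it was registered under in --init-toppings.
    "model": "deltazip/meta-llama.Llama-3.2-3B-Instruct.4b_2n4m_128bs-1",
    "messages": [{"role": "user", "content": "Write a one-line greeting."}],
}

resp = requests.post(url, json=payload, timeout=60).json()
print(resp["choices"][0]["message"]["content"])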
