
Commit 4349939

Minor fix (#46)
* leftover minor
* minor
* fix dockerfile for x86_64 to include hopper
* fix benchmark script
* minor
1 parent 8e610b7 commit 4349939

File tree

12 files changed: +398 −29 lines


docker/Dockerfile.x86_64-cuda

Lines changed: 2 additions & 2 deletions
@@ -6,8 +6,8 @@ LABEL org.opencontainers.image.licenses=Apache-2.0
 LABEL org.opencontainers.image.architecture=amd64
 
 ENV DEBIAN_FRONTEND=noninteractive
-ENV TRITEIA_COMPUTE_CAP=80
-ENV TORCH_CUDA_ARCH_LIST="8.0"
+ENV TRITEIA_COMPUTE_CAP=90
+ENV TORCH_CUDA_ARCH_LIST="8.0 8.6 9.0 9.0a"
 ENV FLASHINFER_ENABLE_AOT="1"
 
 RUN apt update && apt upgrade -y
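
This is the "fix dockerfile for x86_64 to include hopper" item from the commit message: Hopper GPUs (H100/H200) report compute capability 9.0, which the old "8.0"-only arch list did not cover. A minimal sketch (not part of the commit) for checking whether the GPU visible inside the container is covered by the new list:

import torch

# Hopper reports (9, 0); Ampere reports (8, 0) or (8, 6).
if torch.cuda.is_available():
    major, minor = torch.cuda.get_device_capability(0)
    covered = (major, minor) in {(8, 0), (8, 6), (9, 0)}
    print(f"compute capability {major}.{minor}, covered by TORCH_CUDA_ARCH_LIST: {covered}")
else:
    print("no CUDA device visible")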

scratchpad/managers/tp_worker.py

Lines changed: 9 additions & 1 deletion
@@ -10,8 +10,13 @@
 from scratchpad.model_executor.model_runner import ModelRunner
 from scratchpad.config.model_config import ModelConfig
 from scratchpad.scheduler.schedule_batch import ModelWorkerBatch
-from scratchpad.memory.het_pool import HeterogeneousMHATokenToKVPool
 from scratchpad.model_executor.forward_info import ForwardBatch
+from scratchpad.memory import (
+    ReqToTokenPool,
+    HeterogeneousMHATokenToKVPool,
+    TokenToKVPoolAllocator,
+)
+
 from .structs import UpdateWeightReqInput
 from typing import Optional
 from scratchpad.server.args import global_args
@@ -27,9 +32,12 @@ def __init__(
         server_args: ServerArgs,
         nccl_port: int,
         dp_rank: Optional[int] = 0,
+        req_to_token_pool: Optional[ReqToTokenPool] = None,
+        token_to_kv_pool_allocator: Optional[TokenToKVPoolAllocator] = None,
     ):
         # Parse args
         logger.info(f"Initalizing model worker on GPU {gpu_id}, tp_rank: {tp_rank}")
+
         self.tp_rank = tp_rank
         self.server_args = server_args
         # Init model and tokenizer

scratchpad/memory/__init__.py

Lines changed: 5 additions & 0 deletions
@@ -0,0 +1,5 @@
+from .pool import *
+from .het_pool import *
+from .radix_cache import *
+from .topping_pool import *
+from .chunk_cache import *

scratchpad/memory/pool.py

Lines changed: 103 additions & 1 deletion
@@ -1,4 +1,4 @@
-from abc import ABC, abstractmethod
+import abc
 from typing import List, Tuple, Union, TYPE_CHECKING
 import torch
 from scratchpad.utils import logger
@@ -144,6 +144,108 @@ def set_kv_buffer(
         raise NotImplementedError()
 
 
+class KVCache(abc.ABC):
+    @abc.abstractmethod
+    def get_key_buffer(self, layer_id: int) -> torch.Tensor:
+        raise NotImplementedError()
+
+    @abc.abstractmethod
+    def get_value_buffer(self, layer_id: int) -> torch.Tensor:
+        raise NotImplementedError()
+
+    @abc.abstractmethod
+    def get_kv_buffer(self, layer_id: int) -> Tuple[torch.Tensor, torch.Tensor]:
+        raise NotImplementedError()
+
+    @abc.abstractmethod
+    def set_kv_buffer(
+        self,
+        layer: "RadixAttention",
+        loc: torch.Tensor,
+        cache_k: torch.Tensor,
+        cache_v: torch.Tensor,
+    ) -> None:
+        raise NotImplementedError()
+
+    @abc.abstractmethod
+    def get_flat_data(self, indices):
+        raise NotImplementedError()
+
+    @abc.abstractmethod
+    def transfer(self, indices, flat_data):
+        raise NotImplementedError()
+
+    @abc.abstractmethod
+    def transfer_per_layer(self, indices, flat_data, layer_id):
+        raise NotImplementedError()
+
+    def register_layer_transfer_counter(self, layer_transfer_counter):
+        self.layer_transfer_counter = layer_transfer_counter
+
+
+class TokenToKVPoolAllocator:
+    """An allocator managing the indices to kv cache data."""
+
+    def __init__(
+        self,
+        size: int,
+        dtype: torch.dtype,
+        device: str,
+        kvcache: KVCache,
+    ):
+        self.size = size
+        self.dtype = dtype
+        self.device = device
+        self.page_size = 1
+
+        self.free_slots = None
+        self.is_not_in_free_group = True
+        self.free_group = []
+        self.clear()
+
+        self._kvcache = kvcache
+
+    def available_size(self):
+        return len(self.free_slots)
+
+    def get_kvcache(self):
+        return self._kvcache
+
+    def alloc(self, need_size: int):
+        if need_size > len(self.free_slots):
+            return None
+
+        select_index = self.free_slots[:need_size]
+        self.free_slots = self.free_slots[need_size:]
+        return select_index
+
+    def free(self, free_index: torch.Tensor):
+        if free_index.numel() == 0:
+            return
+
+        if self.is_not_in_free_group:
+            self.free_slots = torch.concat((self.free_slots, free_index))
+        else:
+            self.free_group.append(free_index)
+
+    def free_group_begin(self):
+        self.is_not_in_free_group = False
+        self.free_group = []
+
+    def free_group_end(self):
+        self.is_not_in_free_group = True
+        if self.free_group:
+            self.free(torch.concat(self.free_group))
+
+    def clear(self):
+        # The padded slot 0 is used for writing dummy outputs from padded tokens.
+        self.free_slots = torch.arange(
+            1, self.size + 1, dtype=torch.int64, device=self.device
+        )
+        self.is_in_free_group = False
+        self.free_group = []
+
+
 class MHATokenToKVPool(BaseTokenToKVPool):
     def __init__(
         self,
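
The new TokenToKVPoolAllocator only manages free slot indices; the tensors themselves live in whatever KVCache implementation it is constructed with. A minimal sketch of the allocate / grouped-free flow, using a hypothetical stub in place of a real KV cache (the allocator never touches it here):

import torch

from scratchpad.memory import TokenToKVPoolAllocator


class _StubKVCache:
    """Hypothetical placeholder; a real KVCache subclass would hold the K/V tensors."""


allocator = TokenToKVPoolAllocator(
    size=16, dtype=torch.float16, device="cpu", kvcache=_StubKVCache()
)

idx = allocator.alloc(4)           # first 4 free slot indices (slot 0 is reserved for padding)
print(allocator.available_size())  # 12

# Grouped free: indices released between begin/end are collected and
# returned to the free list in a single concatenated free() call.
allocator.free_group_begin()
allocator.free(idx[:2])
allocator.free(idx[2:])
allocator.free_group_end()
print(allocator.available_size())  # 16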

scratchpad/model_executor/model_runner.py

Lines changed: 7 additions & 1 deletion
@@ -29,8 +29,10 @@
     MLATokenToKVPool,
     ReqToTokenPool,
 )
-from scratchpad.memory.het_pool import (
+from scratchpad.memory import (
     HeterogeneousMHATokenToKVPool,
+    ReqToTokenPool,
+    TokenToKVPoolAllocator,
 )
 from scratchpad.model_executor.forward_info import ForwardBatch
 from scratchpad.model_executor.speculative.spec_info import SpeculativeAlgorithm
@@ -59,6 +61,8 @@ def __init__(
         tp_size: int,
         nccl_port: int,
         server_args: ServerArgs,
+        req_to_token_pool: Optional[ReqToTokenPool] = None,
+        token_to_kv_pool_allocator: Optional[TokenToKVPoolAllocator] = None,
     ):
         # Parse args
         self.model_config = model_config
@@ -72,6 +76,8 @@ def __init__(
         self.is_generation = model_config.is_generation
         self.is_multimodal = model_config.is_multimodal
         self.spec_algorithm = SpeculativeAlgorithm.NONE
+        self.req_to_token_pool = req_to_token_pool
+        self.token_to_kv_pool_allocator = token_to_kv_pool_allocator
         logger.info(f"model config: {model_config}")
         # Model-specific adjustment
         if (

scratchpad/model_executor/utils.py

Lines changed: 103 additions & 2 deletions
@@ -1,19 +1,27 @@
 import torch
 import glob
 import contextlib
-from typing import List, Generator, Tuple, Type
+from typing import List, Generator, Tuple, Type, Protocol
 from tqdm import tqdm
 import json
 import os
-from scratchpad.utils import snapshot_download, get_lock, DisabledTqdm
+from scratchpad.utils import (
+    snapshot_download,
+    get_lock,
+    DisabledTqdm,
+    is_pin_memory_available,
+)
 from safetensors.torch import safe_open
 from scratchpad.nn.models import ModelRegistry
 from scratchpad.config import ModelConfig, LoadConfig
 from scratchpad.nn.quantization import get_quantization_config, QuantizationConfig
 import huggingface_hub
 from torch import nn
+from torch.func import functional_call
 
 _BAR_FORMAT = "{desc}: {percentage:3.0f}% Completed | {n_fmt}/{total_fmt} [{elapsed}<{remaining}, {rate_fmt}]\n"  # noqa: E501
+_CPU_OFFLOAD_BYTES = 0
+_CPU_OFFLOAD_MAX_BYTES = 0
 
 
 @contextlib.contextmanager
@@ -208,3 +216,96 @@ def get_quant_config(
         )
 
     return quant_cls.from_config(config)
+
+
+def set_cpu_offload_max_bytes(max_bytes: int) -> None:
+    global _CPU_OFFLOAD_MAX_BYTES, _CPU_OFFLOAD_BYTES
+    _CPU_OFFLOAD_BYTES = 0
+    _CPU_OFFLOAD_MAX_BYTES = max_bytes
+
+
+def maybe_offload_to_cpu(module: torch.nn.Module) -> torch.nn.Module:
+    device = next(module.parameters()).device
+
+    if device == torch.device("cpu"):
+        return module
+
+    global _CPU_OFFLOAD_MAX_BYTES, _CPU_OFFLOAD_BYTES
+    if _CPU_OFFLOAD_BYTES >= _CPU_OFFLOAD_MAX_BYTES:
+        return module
+
+    pin_memory = is_pin_memory_available()
+    # offload parameters to CPU
+    # use pin_memory if possible, which helps cudagraph capture speed
+    offloaded_parameters = False
+    for p in module.parameters():
+        if _CPU_OFFLOAD_BYTES >= _CPU_OFFLOAD_MAX_BYTES:
+            # we use per-parameter offloading
+            # one module might have some parameters offloaded and some not
+            break
+
+        # `torch.empty_like` does not support `pin_memory` argument
+        cpu_data = torch.empty_strided(
+            size=p.data.size(),
+            stride=p.data.stride(),
+            dtype=p.data.dtype,
+            layout=p.data.layout,
+            device="cpu",
+            pin_memory=pin_memory,
+        )
+        cpu_data.copy_(p.data)
+        p.data = cpu_data
+        _CPU_OFFLOAD_BYTES += p.data.numel() * p.data.element_size()
+        offloaded_parameters = True
+
+    if offloaded_parameters:
+        original_forward = module.forward
+
+        def forward(*args, **kwargs):
+            module.forward = original_forward
+            device_state = {
+                # here we blindly call `to(device)`
+                # if the parameter is already on the device, it will be a no-op
+                k: v.to(device, non_blocking=True)
+                for k, v in module.state_dict().items()
+            }
+            output = functional_call(module, device_state, args=args, kwargs=kwargs)
+            module.forward = forward
+            return output
+
+        module.forward = forward
+
+    return module
+
+
+class LayerFn(Protocol):
+    def __call__(self, layer_id: int, prefix: str) -> torch.nn.Module:
+        ...
+
+
+def add_prefix(name: str, prefix: str) -> str:
+    """Add a weight path prefix to a module name.
+
+    Args:
+        name: base module name.
+        prefix: weight prefix str to added to the front of `name` concatenated with `.`.
+
+    Returns:
+        The string `prefix.name` if prefix is non-empty, otherwise just `name`.
+    """
+    return name if not prefix else f"{prefix}.{name}"
+
+
+def make_layers(
+    num_hidden_layers: int,
+    layer_fn: LayerFn,
+    prefix: str = "",
+) -> Tuple[int, int, torch.nn.ModuleList]:
+    """Make a list of layers with the given layer function"""
+    modules = torch.nn.ModuleList(
+        [
+            maybe_offload_to_cpu(layer_fn(idx=idx, prefix=add_prefix(idx, prefix)))
+            for idx in range(num_hidden_layers)
+        ]
+    )
+    return modules
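
set_cpu_offload_max_bytes sets a global byte budget, and maybe_offload_to_cpu moves parameter data to (pinned, when available) CPU memory until that budget is spent, swapping the module's forward for a wrapper that copies the weights back to the original device per call via torch.func.functional_call. A minimal usage sketch, assuming a CUDA device is present (not part of the commit):

import torch

from scratchpad.model_executor.utils import (
    maybe_offload_to_cpu,
    set_cpu_offload_max_bytes,
)

# Allow up to 2 GiB of parameters to be kept in CPU memory.
set_cpu_offload_max_bytes(2 * 1024**3)

layer = torch.nn.Linear(4096, 4096).cuda()
layer = maybe_offload_to_cpu(layer)   # weight/bias now live in (pinned) CPU memory
print(layer.weight.device)            # cpu

x = torch.randn(8, 4096, device="cuda")
y = layer(x)                          # wrapper copies the params to the GPU for this call only
print(y.shape, y.device)              # torch.Size([8, 4096]) cuda:0

make_layers applies maybe_offload_to_cpu to every layer it builds, so the offload budget is consumed layer by layer as the model is constructed.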

tools/benchmark/bench_perf.py

Lines changed: 4 additions & 3 deletions
@@ -64,7 +64,7 @@ async def run_benchmark(
     goodput_config_dict: Dict[str, float],
     max_concurrency: Optional[int] = None,
 ):
-    system_info = await async_request_sp_sysinfo(args.endpoint)
+    # system_info = await async_request_sp_sysinfo(args.endpoint)
     pbar = tqdm(total=len(input_requests))
     tasks: List[asyncio.Task] = []
     semaphore = asyncio.Semaphore(max_concurrency) if max_concurrency else None
@@ -101,7 +101,7 @@ async def limited_request_func(request_func_input, pbar):
     output_file = write_benchmark(
         metrics,
         args.output,
-        system_info,
+        {},
         args,
         outputs,
     )
@@ -131,6 +131,7 @@ def benchmark(args):
        except Exception as e:
            print("Server is not ready. Please start the server first.")
            time.sleep(5)
+    print(f"Server is ready. Starting benchmark...")
     asyncio.run(
         run_benchmark(
             args,
@@ -209,7 +210,7 @@ def benchmark(args):
        "--wait-until-ready",
        action="store_true",
        help="Wait until the server is ready before starting the benchmark.",
-       default=True,
+       default=False,
    )
    args = parser.parse_args()
    benchmark(args)
