25 changes: 23 additions & 2 deletions simdistserve/README.md
@@ -48,10 +48,31 @@ Ideally you should get the following result:
Best per GPU rate: 1.56
Best config: pp_cross=1, tp_prefill=2, pp_prefill=1, tp_decode=1, pp_decode=1
```

### Ratio search
Search for the best ratio between prefill and decode instances:
```bash
python -m simdistserve.simulate_ratio \
--prefill-tp 8 \
--prefill-pp 1 \
--decode-tp 8 \
--decode-pp 1 \
--kv-cache-mem-per-gpu 64 \
--kv-transfer-bw 600 \
--model-type "facebook/opt-66b" \
--workload sharegpt --backend distserve \
--prefill-target 200 --decode-target 100 \
--prefill-percentage 90 --decode-percentage 90 \
--max-per-gpu-rate 5 \
--esp 0.25 \
--N 300
```
Output:
```text
Best config: prefill_instance=15, decode_instance=8, per_gpu_rate=4.84375
```
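
For a quick sanity check, the reported per-GPU rate can be converted into cluster totals. This is a rough sketch under the assumption that `per_gpu_rate` is requests/s per GPU and each instance occupies `tp * pp` GPUs:

```python
# Rough sanity check on the reported config (assumptions: `per_gpu_rate` is
# requests/s per GPU; each instance occupies tp * pp GPUs).
prefill_instances, decode_instances = 15, 8
prefill_gpus = prefill_instances * 8 * 1   # --prefill-tp 8, --prefill-pp 1
decode_gpus = decode_instances * 8 * 1     # --decode-tp 8, --decode-pp 1
total_gpus = prefill_gpus + decode_gpus
per_gpu_rate = 4.84375
print(total_gpus, per_gpu_rate * total_gpus)  # 184 GPUs, ~891 req/s aggregate
```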
## Architecture

The simulator is written on top of `simpy`, a discrete event simulator built natively in Python.
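
For readers new to `simpy`: in a discrete-event simulation, time jumps directly between scheduled events rather than advancing in fixed steps. The sketch below is purely illustrative; the stages and latencies are made up and are not the simulator's actual components:

```python
import simpy

def request_lifecycle(env: simpy.Environment, name: str):
    # A simpy "process" is a generator that yields events; simulated time
    # only advances when an event (here, a timeout) is yielded.
    print(f"{env.now:>4}: {name} starts prefill")
    yield env.timeout(200)   # illustrative prefill latency
    print(f"{env.now:>4}: {name} starts decode")
    yield env.timeout(100)   # illustrative decode latency
    print(f"{env.now:>4}: {name} exits the system")

env = simpy.Environment()
env.process(request_lifecycle(env, "req-0"))
env.run()
```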

At a high level, our simulator is composed of the following core components (under the `base` and `clusters` modules):

14 changes: 14 additions & 0 deletions simdistserve/base/request.py
@@ -7,6 +7,8 @@
E_DO_PREFILL = "do_prefill"
E_WAIT_DECODE = "wait_decode"
E_DO_DECODE = "do_decode"
E_WAIT_KVCACHE_MIGRATION = "wait_kvcache_migration"
E_DO_KVCACHE_MIGRATION = "do_kvcache_migration"
E_FINISH_PREFILL = "finish_prefill"
E_FINISH_DECODE = "finish_decode"
E_EXIT_SYSTEM = "exit_system"
@@ -61,6 +63,11 @@ def __init__(
# set this value if a request belongs to a particular chunk
# The last worker in the pipeline unsets this value at a chunk's end.
self.chunk_id = None
# After the request finishes prefill, `kvcache_generated` should be set to `True`.
self.kvcache_generated = False
# Set to `True` once the prefill phase completes (see `finish_prefill`).
self.prefill_is_done = False
# Set to `True` once the kv-cache has been transferred to the decode instance.
self.kvcache_is_transferred = False
# The worker that performed prefill for this request.
self.prefill_worker = None

@property
def current_context_len(self):
@@ -88,6 +95,12 @@ def wait_decode(self, wid=None):

def do_decode(self, wid=None):
self._log_event(E_DO_DECODE, wid=wid)

def wait_kvcache_migration(self, wid=None):
self._log_event(E_WAIT_KVCACHE_MIGRATION, wid=wid)

def do_kvcache_migration(self, wid=None):
self._log_event(E_DO_KVCACHE_MIGRATION, wid=wid)

def _reset_chunked_prefill_metadata(self):
"""Reset the metadata of chunked prefill."""
@@ -111,6 +124,7 @@ def finish_prefill(self, is_finished_one_round=False, wid=None, next_wid=None):
# Reset counter to 0
# TODO: Should we do self.counter += 1?
self.counter = 0
self.prefill_is_done = True
# Hack to ensure "wait_decode" appears at least once.
self.wait_decode(wid=next_wid)
if not self.should_finish():
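
Taken together, the new fields and event hooks let a worker record an explicit kv-cache migration phase between prefill and decode. Below is a minimal sketch of how a worker loop might drive them; the driver function, `env`, and the transfer delay are assumptions, and only the `Request` methods and flags come from this diff:

```python
def migrate_kvcache(env, req, decode_wid, transfer_delay):
    # Queue the request for migration once prefill has produced its kv-cache.
    req.wait_kvcache_migration(wid=decode_wid)  # logs E_WAIT_KVCACHE_MIGRATION
    yield env.timeout(transfer_delay)           # assumed transfer latency
    req.do_kvcache_migration(wid=decode_wid)    # logs E_DO_KVCACHE_MIGRATION
    req.kvcache_is_transferred = True           # flag introduced in this diff
```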