25 changes: 23 additions & 2 deletions simdistserve/README.md
@@ -48,10 +48,31 @@ Ideally you should get the following result:
Best per GPU rate: 1.56
Best config: pp_cross=1, tp_prefill=2, pp_prefill=1, tp_decode=1, pp_decode=1
```

### Ratio search
Search for the best ratio between prefill and decode instances:
```bash
python -m simdistserve.simulate_ratio \
--prefill-tp 8 \
--prefill-pp 1 \
--decode-tp 8 \
--decode-pp 1 \
--kv-cache-mem-per-gpu 64 \
--kv-transfer-bw 600 \
--model-type "facebook/opt-66b" \
--workload sharegpt --backend distserve \
--prefill-target 200 --decode-target 100 \
--prefill-percentage 90 --decode-percentage 90 \
--max-per-gpu-rate 5 \
--esp 0.25 \
--N 300
```
Output:
```text
Best config: prefill_instance=15, decode_instance=8, per_gpu_rate=4.84375
```
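
For a quick sanity check, the reported per-GPU rate can be converted into cluster totals. This is a rough sketch under the assumption that `per_gpu_rate` is requests/s per GPU and each instance occupies `tp * pp` GPUs:

```python
# Rough sanity check on the reported config (assumptions: `per_gpu_rate` is
# requests/s per GPU; each instance occupies tp * pp GPUs).
prefill_instances, decode_instances = 15, 8
prefill_gpus = prefill_instances * 8 * 1   # --prefill-tp 8, --prefill-pp 1
decode_gpus = decode_instances * 8 * 1     # --decode-tp 8, --decode-pp 1
total_gpus = prefill_gpus + decode_gpus
per_gpu_rate = 4.84375
print(total_gpus, per_gpu_rate * total_gpus)  # 184 GPUs, ~891 req/s aggregate
```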
## Architecture

The simulator is written on top of `simpy`, a discrete event simulator built natively in Python.
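
For readers new to `simpy`: in a discrete-event simulation, time jumps directly between scheduled events rather than advancing in fixed steps. The sketch below is purely illustrative; the stages and latencies are made up and are not the simulator's actual components:

```python
import simpy

def request_lifecycle(env: simpy.Environment, name: str):
    # A simpy "process" is a generator that yields events; simulated time
    # only advances when an event (here, a timeout) is yielded.
    print(f"{env.now:>4}: {name} starts prefill")
    yield env.timeout(200)   # illustrative prefill latency
    print(f"{env.now:>4}: {name} starts decode")
    yield env.timeout(100)   # illustrative decode latency
    print(f"{env.now:>4}: {name} exits the system")

env = simpy.Environment()
env.process(request_lifecycle(env, "req-0"))
env.run()
```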

At a high level, our simulator is composed of the following core components (under the `base` and `clusters` modules):

14 changes: 14 additions & 0 deletions simdistserve/base/request.py
@@ -7,6 +7,8 @@
E_DO_PREFILL = "do_prefill"
E_WAIT_DECODE = "wait_decode"
E_DO_DECODE = "do_decode"
E_WAIT_KVCACHE_MIGRATION = "wait_kvcache_migration"
E_DO_KVCACHE_MIGRATION = "do_kvcache_migration"
E_FINISH_PREFILL = "finish_prefill"
E_FINISH_DECODE = "finish_decode"
E_EXIT_SYSTEM = "exit_system"
@@ -61,6 +63,11 @@ def __init__(
# set this value if a request belongs to a particular chunk
# The last worker in the pipeline unsets this value at a chunk's end.
self.chunk_id = None
# After the request finishes prefill, `kvcache_generated` should be set to `True`.
self.kvcache_generated = False
# Set to `True` once the prefill phase completes (see `finish_prefill`).
self.prefill_is_done = False
# Set to `True` once the kv-cache has been transferred to the decode instance.
self.kvcache_is_transferred = False
# The worker that performed prefill for this request.
self.prefill_worker = None

@property
def current_context_len(self):
@@ -88,6 +95,12 @@ def wait_decode(self, wid=None):

def do_decode(self, wid=None):
self._log_event(E_DO_DECODE, wid=wid)

def wait_kvcache_migration(self, wid=None):
self._log_event(E_WAIT_KVCACHE_MIGRATION, wid=wid)

def do_kvcache_migration(self, wid=None):
self._log_event(E_DO_KVCACHE_MIGRATION, wid=wid)

def _reset_chunked_prefill_metadata(self):
"""Reset the metadata of chunked prefill."""
@@ -111,6 +124,7 @@ def finish_prefill(self, is_finished_one_round=False, wid=None, next_wid=None):
# Reset counter to 0
# TODO: Should we do self.counter += 1?
self.counter = 0
self.prefill_is_done = True
# Hack to ensure "wait_decode" appears at least once.
self.wait_decode(wid=next_wid)
if not self.should_finish():
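
Taken together, the new fields and event hooks let a worker record an explicit kv-cache migration phase between prefill and decode. Below is a minimal sketch of how a worker loop might drive them; the driver function, `env`, and the transfer delay are assumptions, and only the `Request` methods and flags come from this diff:

```python
def migrate_kvcache(env, req, decode_wid, transfer_delay):
    # Queue the request for migration once prefill has produced its kv-cache.
    req.wait_kvcache_migration(wid=decode_wid)  # logs E_WAIT_KVCACHE_MIGRATION
    yield env.timeout(transfer_delay)           # assumed transfer latency
    req.do_kvcache_migration(wid=decode_wid)    # logs E_DO_KVCACHE_MIGRATION
    req.kvcache_is_transferred = True           # flag introduced in this diff
```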