Commit d637626

feat: Add experimental vLLM & Unsloth decoupling support
1 parent 3b75c10 commit d637626

File tree

7 files changed: +365 -19 lines changed


dev/yes-no-maybe.ipynb

Lines changed: 5 additions & 1 deletion
@@ -2,7 +2,7 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -46,6 +46,10 @@
     "    name=\"001\",\n",
     "    project=\"yes-no-maybe\",\n",
     "    base_model=\"Qwen/Qwen2.5-7B-Instruct\",\n",
+    "    _internal_config=art.dev.InternalModelConfig(\n",
+    "        _decouple_vllm_and_unsloth=True,\n",
+    "        engine_args=art.dev.EngineArgs(gpu_memory_utilization=0.7),\n",
+    "    ),\n",
     ")\n",
     "await model.register(backend)\n",
     "\n",

src/art/__init__.py

Lines changed: 7 additions & 0 deletions
@@ -9,6 +9,13 @@
 if os.environ.get("IMPORT_UNSLOTH", "0") == "1":
     import unsloth  # type: ignore # noqa: F401

+if os.environ.get("IMPORT_PEFT", "0") == "1":
+    # torch.cuda.MemPool doesn't currently support expandable_segments which is used in sleep mode
+    conf = os.environ["PYTORCH_CUDA_ALLOC_CONF"].split(",")
+    if "expandable_segments:True" in conf:
+        conf.remove("expandable_segments:True")
+        os.environ["PYTORCH_CUDA_ALLOC_CONF"] = ",".join(conf)
+
 from . import dev
 from .backend import Backend
 from .batches import trajectory_group_batches
src/art/dev/model.py

Lines changed: 12 additions & 0 deletions
@@ -38,13 +38,21 @@ def get_model_config(
         max_lora_rank=8,
         use_async=True,
     )
+    if config.get("_decouple_vllm_and_unsloth", False):
+        init_args["fast_inference"] = False
+        init_args.pop("disable_log_stats")
+        init_args.pop("enable_prefix_caching")
+        init_args.pop("gpu_memory_utilization")
+        init_args.pop("max_lora_rank")
+        init_args.pop("use_async")
     engine_args = EngineArgs(
         disable_log_requests=True,
         # Multi-step processing is not supported for the Xformers attention backend
         # which is the fallback for devices with compute capability < 8.0
         num_scheduler_steps=(
             16
             if config.get("torchtune_args") is None
+            and not config.get("_decouple_vllm_and_unsloth", False)
             and torch.cuda.get_device_capability()[0] >= 8
             else 1
         ),
@@ -59,6 +67,8 @@ def get_model_config(
         engine_args["model"] = last_checkpoint_dir
     elif config.get("torchtune_args") is not None:
         engine_args["model"] = base_model
+    if config.get("_decouple_vllm_and_unsloth", False):
+        engine_args["model"] = base_model
     peft_args = PeftArgs(
         r=8,  # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
         target_modules=[
@@ -105,6 +115,7 @@ def get_model_config(
         peft_args=peft_args,
         trainer_args=trainer_args,
         torchtune_args=torchtune_args,
+        _decouple_vllm_and_unsloth=config.get("_decouple_vllm_and_unsloth", False),
     )


@@ -123,6 +134,7 @@ class InternalModelConfig(TypedDict, total=False):
     peft_args: "PeftArgs"
     trainer_args: "TrainerArgs"
     torchtune_args: TorchtuneArgs | None
+    _decouple_vllm_and_unsloth: bool


 class InitArgs(TypedDict, total=False):
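
The net effect on the Unsloth init arguments is easier to read outside the diff. A minimal sketch using plain dicts instead of the library's InitArgs TypedDict; the key names are taken from the diff, but the starting values below are illustrative, not the library's defaults:

# Illustrative starting values; the real ones are built inside get_model_config.
init_args = {
    "disable_log_stats": False,
    "enable_prefix_caching": True,
    "gpu_memory_utilization": 0.7,
    "max_lora_rank": 8,
    "use_async": True,
}

decouple = True  # stands in for config.get("_decouple_vllm_and_unsloth", False)
if decouple:
    # Unsloth no longer drives vLLM, so its fast_inference path is disabled
    # and the vLLM-specific knobs are stripped from the Unsloth init args.
    init_args["fast_inference"] = False
    for key in (
        "disable_log_stats",
        "enable_prefix_caching",
        "gpu_memory_utilization",
        "max_lora_rank",
        "use_async",
    ):
        init_args.pop(key)

print(init_args)  # {'fast_inference': False}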

src/art/local/backend.py

Lines changed: 11 additions & 7 deletions
@@ -112,29 +112,33 @@ async def register(
     async def _get_service(self, model: TrainableModel) -> ModelService:
         from ..torchtune.service import TorchtuneService
         from ..unsloth.service import UnslothService
+        from ..unsloth.decoupled_service import DecoupledUnslothService

         if model.name not in self._services:
             config = dev.get_model_config(
                 base_model=model.base_model,
                 output_dir=get_model_dir(model=model, art_path=self._path),
                 config=model._internal_config,
             )
-            service_class = (
-                TorchtuneService
-                if config.get("torchtune_args") is not None
-                else UnslothService
-            )
+            if config.get("torchtune_args") is not None:
+                service_class = TorchtuneService
+            elif config.get("_decouple_vllm_and_unsloth", False):
+                service_class = DecoupledUnslothService
+            else:
+                service_class = UnslothService
             self._services[model.name] = service_class(
                 model_name=model.name,
                 base_model=model.base_model,
                 config=config,
                 output_dir=get_model_dir(model=model, art_path=self._path),
             )
-
         if not self._in_process:
             # Kill all "model-service" processes to free up GPU memory
             subprocess.run(["pkill", "-9", "model-service"])
-            if isinstance(self._services[model.name], UnslothService):
+            if isinstance(
+                self._services[model.name],
+                (UnslothService, DecoupledUnslothService),
+            ):
                 # To enable sleep mode, import peft before unsloth
                 # Unsloth will issue warnings, but everything appears to be okay
                 if config.get("engine_args", {}).get("enable_sleep_mode", False):