[Speculative Decoding] EAGLE Implementation with Top-1 proposer #6830
Merged
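For context on the title: a "Top-1" proposer drafts a single greedy chain of k tokens per speculation step (no token tree), which the target model then verifies with rejection sampling. A minimal sketch of that loop, using hypothetical names (`draft_step`, `k`) that are not part of this PR:

import torch

def propose_top1(draft_step, input_ids: torch.Tensor,
                 prev_hidden: torch.Tensor, k: int) -> list:
    """Greedy (top-1) proposal: one chain of k draft tokens, no token tree.

    `draft_step` is a hypothetical callable that runs one forward pass of the
    EAGLE draft head and returns (logits, hidden_states).
    """
    proposals = []
    for _ in range(k):
        logits, prev_hidden = draft_step(input_ids, prev_hidden)
        next_token = torch.argmax(logits[:, -1], dim=-1)  # top-1 = greedy pick
        proposals.append(next_token)
        input_ids = torch.cat([input_ids, next_token[:, None]], dim=-1)
    # The target model then scores the chain and accepts a prefix of it via
    # rejection sampling, so the output distribution is unchanged.
    return proposals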
Changes from 13 commits (54 commits total)

Commits
All commits are by abhigoyal1997:

3c590b1  initial changes to support EAGLE
5f5bed1  handling hidden_states in case of bonus tokens since EAGLE will need it
023e72d  enabling CUDA graph
8ac1570  adding E2E test and formatting
b379948  minor bug fix in graph capture
aef9c00  fixing broadcasting of hidden states in distributed worker
c8d63bd  formatting
733ca4f  Merge branch 'main' of github.fkinternal.com:abhinav-goyal/vllm into …
1a0aa60  formatting
83b3dd8  Merge branch 'main' of github.fkinternal.com:abhinav-goyal/vllm into …
b1f05ac  Masking position=0 in inputs for EAGLE
bdee07c  reformatting
441374f  Fixing the order of execution for scorer and proposer in non-driver w…
0d1cbae  Adding hidden state propagation to _execute_model_spmd
b60384a  Adding CUDA graph tests for medusa and eagle. Renaming mlp to medusa …
7b6a0e6  Moving hidden states shift to spec_decode_worker
9d806b3  formatting
e1e3175  Merge branch 'vllm-project:main' into eagle
8db174f  Adding vocab truncation to EAGLE
b6b0548  Minor changes and fixes. Adding expand model request and hidden state…
f9cbd49  Merge branch 'vllm-project:main' into eagle
89184a1  Merge branch 'main' into eagle
cf8b685  Merge branch 'vllm-project:main' into eagle
3c24f4b  Merge branch 'vllm-project:main' into eagle
a94ea89  Merge branch 'main' into eagle
eaa586c  Removing commented code and a minor comment fix
38e2b5c  formatting
2f17900  Merge branch 'vllm-project:main' into eagle
c5f8d15  adding comments to clarify compatibility of eagle checkpoint in eagle.py
53ab660  Merge branch 'vllm-project:main' into eagle
7f46c68  fixing model_cls resolution in eagle
5e5d214  fixing model_cls resolution in eagle
17c0fc6  Merge branch 'main' into eagle
ad04e7f  adding doctrings to EAGLE and Medusa models
90bee1d  fixing hidden states handling in batch expansion
88c20e6  making HiddenStates a dataclass and renaming last_non_bonus_hidden_st…
3ff257b  Merge branch 'hidden_states_fix' of github.fkinternal.com:abhinav-goy…
1753d9a  reformatting
99484ae  adding acceptance rate test for large output length
2e51385  fixing hidden states manipulation for batch expansion
faa2e28  Merge branch 'hidden_states_fix' of github.fkinternal.com:abhinav-goy…
d8bcff0  print acceptance rate in spec decode tests
1654d4d  Updating HiddenStates to handle prefill step as well
6954ead  changing expected acceptance rate for test
08b3cd5  Merge branch 'vllm-project:main' into hidden_states_fix
5815ccc  Merge branch 'vllm-project:main' into hidden_states_fix
601c816  Merge branch 'hidden_states_fix' into eagle
df87143  Adding explanation for trucated vocab and merging main
f906cef  formatting
2147583  Merge branch 'main' of github.fkinternal.com:abhinav-goyal/vllm into …
3febb95  Fixing compatibility of `worker.multi_step_worker.MultiStepWorker` wi…
90582e2  Merge branch 'main' into eagle
af5552b  Merge branch 'main' into eagle
284468d  adding comment
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,224 @@
"""This docstring details important information on the testing methodology.

Most of the tests rely on "greedy equality", where we expect the output of
speculative decoding on a sequence to exactly match the output of normal
non-speculative decoding.

Since speculative decoding with rejection sampling guarantees that the output
distribution matches the target model's output distribution (up to hardware
numerics, see https://arxiv.org/pdf/2302.01318.pdf), we can expect greedy
equality.

However, we still need to verify that the following scenarios pass:
* Batch size 1 greedy equality
* Batch size >1 greedy equality
* Greedy equality under preemption
* Greedy equality under various numbers of speculative tokens.

With these tests, we can say that, at a minimum, EAGLE does not break the
correctness of the target model outputs.
"""

import pytest

from .conftest import run_greedy_equality_correctness_test

# main model
MAIN_MODEL = "JackFram/llama-68m"

# speculative model
SPEC_MODEL = "abhigoyal/vllm-eagle-llama-68m-random"

# max. number of speculative tokens: this corresponds to
# num_heads in the config.json of the speculator model.
MAX_SPEC_TOKENS = 4

# precision
PRECISION = "float32"


@pytest.mark.parametrize(
    "common_llm_kwargs",
    [{
        # Skip cuda graph recording for fast test.
        "enforce_eager": True,

        # Required for spec decode.
        "use_v2_block_manager": True,

        # Print spec metrics.
        "disable_log_stats": False,

        # Precision
        "dtype": PRECISION,

        # Main model
        "model": MAIN_MODEL,
    }])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [
    {
        "speculative_model": SPEC_MODEL,
        "num_speculative_tokens": MAX_SPEC_TOKENS,
    },
])
@pytest.mark.parametrize("output_len", [
    128,
])
@pytest.mark.parametrize("batch_size", [1, 32])
@pytest.mark.parametrize("seed", [1])
def test_eagle_e2e_greedy_correctness(baseline_llm_generator,
                                      test_llm_generator, batch_size: int,
                                      output_len: int):
    """Verify greedy equality with different batch sizes."""
    run_greedy_equality_correctness_test(baseline_llm_generator,
                                         test_llm_generator,
                                         batch_size,
                                         max_output_len=output_len,
                                         force_output_len=True)


@pytest.mark.parametrize(
    "common_llm_kwargs",
    [{
        "block_size": 8,
        # 2 for small prompt, 256//8 for generated.
        "num_gpu_blocks_override": 2 + 256 // 8,
        "max_model_len": (2 + 256 // 8) * 8,

        # Skip cuda graph recording for fast test.
        "enforce_eager": True,

        # Required for spec decode.
        "use_v2_block_manager": True,

        # Precision
        "dtype": PRECISION,

        # Main model
        "model": MAIN_MODEL,
    }])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [
    {
        "speculative_model": SPEC_MODEL,
        "num_speculative_tokens": MAX_SPEC_TOKENS,
    },
])
@pytest.mark.parametrize(
    "output_len",
    [
        # Use small output len for fast test.
        128,
    ])
@pytest.mark.parametrize("batch_size", [4])
@pytest.mark.parametrize("seed", [1])
def test_eagle_e2e_greedy_correctness_with_preemption(baseline_llm_generator,
                                                      test_llm_generator,
                                                      batch_size: int,
                                                      output_len: int):
    """Verify greedy equality, even when some sequences are preempted mid-
    generation.
    """
    run_greedy_equality_correctness_test(baseline_llm_generator,
                                         test_llm_generator,
                                         batch_size,
                                         max_output_len=output_len,
                                         force_output_len=True)


@pytest.mark.parametrize(
    "common_llm_kwargs",
    [{
        # Skip cuda graph recording for fast test.
        "enforce_eager": True,

        # Required for spec decode.
        "use_v2_block_manager": True,

        # Precision
        "dtype": PRECISION,

        # Main model
        "model": MAIN_MODEL,
    }])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize(
    "test_llm_kwargs",
    [
        {
            "speculative_model": SPEC_MODEL,
            "num_speculative_tokens": k,
        }
        # Try a range of num. speculative tokens
        for k in range(1, 1 + MAX_SPEC_TOKENS)
    ])
@pytest.mark.parametrize("batch_size", [2])
@pytest.mark.parametrize(
    "output_len",
    [
        # Use smaller output len for fast test.
        32,
    ])
@pytest.mark.parametrize("seed", [1])
def test_eagle_different_k(baseline_llm_generator, test_llm_generator,
                           batch_size: int, output_len: int):
    """Verify that EAGLE speculative decoding produces exact equality
    to without spec decode with different values of num_speculative_tokens.
    """
    run_greedy_equality_correctness_test(baseline_llm_generator,
                                         test_llm_generator,
                                         batch_size,
                                         max_output_len=output_len,
                                         force_output_len=True)


@pytest.mark.parametrize(
    "common_llm_kwargs",
    [{
        # Skip cuda graph recording for fast test.
        "enforce_eager": True,

        # Required for spec decode.
        "use_v2_block_manager": True,

        # Precision
        "dtype": PRECISION,

        # Main model
        "model": MAIN_MODEL,
    }])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs",
                         [{
                             "speculative_model": SPEC_MODEL,
                             "num_speculative_tokens": MAX_SPEC_TOKENS,
                             "speculative_disable_by_batch_size": 4
                         }])
@pytest.mark.parametrize("batch_size", [1, 5])
@pytest.mark.parametrize(
    "output_len",
    [
        # Use smaller output len for fast test.
        32,
    ])
@pytest.mark.parametrize("seed", [1])
def test_eagle_disable_queue(baseline_llm_generator, test_llm_generator,
                             batch_size: int, output_len: int):
    """Verify that EAGLE speculative decoding produces exact equality
    to without spec decode when speculation is disabled for large
    batch sizes.
    """
    run_greedy_equality_correctness_test(baseline_llm_generator,
                                         test_llm_generator,
                                         batch_size,
                                         max_output_len=output_len,
                                         force_output_len=True)


if __name__ == "__main__":
    pytest.main([__file__])
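Note that `run_greedy_equality_correctness_test` comes from the local `conftest.py` and is not part of this diff. A rough sketch of the comparison it is assumed to perform, with hypothetical names for the generated token-id lists:

# Rough sketch only; the real helper lives in the spec_decode e2e conftest and
# also handles prompts, seeds, and metric collection for the vLLM generators.
def greedy_equality_check(baseline_token_ids, spec_token_ids):
    """Assert speculative decoding reproduces the non-speculative greedy output."""
    assert len(baseline_token_ids) == len(spec_token_ids)
    for baseline_ids, spec_ids in zip(baseline_token_ids, spec_token_ids):
        # Rejection sampling preserves the target distribution, so under greedy
        # sampling the token ids must match exactly, position by position.
        assert baseline_ids == spec_ids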
@@ -0,0 +1,86 @@
from typing import Iterable, List, Optional, Tuple

import torch
import torch.nn as nn

from vllm.attention.backends.abstract import AttentionMetadata
from vllm.model_executor.model_loader.weight_utils import default_weight_loader
from vllm.model_executor.models import ModelRegistry
from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.sequence import IntermediateTensors, SamplerOutput
from vllm.transformers_utils.configs.eagle import EAGLEConfig


class EAGLE(nn.Module):
    """EAGLE draft model for speculative decoding.

    Wraps the base architecture named in the EAGLE config: the input token
    embeddings are concatenated with the target model's previous hidden
    states, projected back to hidden_size by `self.fc`, and fed to the
    wrapped model as inputs_embeds.
    """

    def __init__(self, config: EAGLEConfig, *args, **kwargs) -> None:
        super().__init__()
        self.config = config

        architectures = getattr(self.config.model, "architectures", [])
        model_cls = None
        for arch in architectures:
            model_cls = ModelRegistry.load_model_cls(arch)
            if model_cls is not None:
                break
        if model_cls is None:
            # Fail loudly instead of hitting an unbound model_cls below.
            raise ValueError(
                f"Unable to resolve a model class for {architectures}")

        self.model = model_cls(self.config.model, *args, **kwargs)
        self.fc = nn.Linear(config.model.hidden_size * 2,
                            config.model.hidden_size,
                            bias=False)

        self.token_map = None

    @property
    def sampler(self):
        return self.model.sampler

    def forward(
        self,
        input_ids: torch.Tensor,
        positions: torch.Tensor,
        kv_caches: List[torch.Tensor],
        attn_metadata: AttentionMetadata,
        previous_hidden_states: torch.Tensor,
        intermediate_tensors: Optional[IntermediateTensors] = None,
    ) -> torch.Tensor:

        tok_embeds = self.model.model.embed_tokens(input_ids)

        # Fuse the token embeddings with the target model's hidden states from
        # the previous step.
        inputs_embeds = self.fc(
            torch.cat([tok_embeds, previous_hidden_states], dim=-1))

        # There is no previous hidden state for the first token, so mask the
        # inputs at position == 0.
        inputs_embeds[positions == 0] = 0

        hidden_states = self.model.model(
            input_ids=None,
            inputs_embeds=inputs_embeds,
            positions=positions,
            kv_caches=kv_caches,
            attn_metadata=attn_metadata,
            intermediate_tensors=intermediate_tensors)
        return hidden_states

    def compute_logits(self, hidden_states: torch.Tensor,
                       sampling_metadata: SamplingMetadata) -> torch.Tensor:
        return self.model.compute_logits(hidden_states, sampling_metadata)

    def sample(
        self,
        logits: torch.Tensor,
        sampling_metadata: SamplingMetadata,
    ) -> Optional[SamplerOutput]:
        next_tokens = self.sampler(logits, sampling_metadata)
        return next_tokens

    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
        # Remap checkpoint names for compatibility: "fc." weights belong to
        # this wrapper; everything else is delegated to the wrapped model,
        # adding the "model." prefix where the checkpoint omits it.
        model_weights = []
        for name, loaded_weight in weights:
            if name.startswith("fc."):
                weight_loader = getattr(self.fc.weight, "weight_loader",
                                        default_weight_loader)
                weight_loader(self.fc.weight, loaded_weight)
            elif name.startswith("lm_head.") or name.startswith("model."):
                model_weights.append((name, loaded_weight))
            else:
                model_weights.append((f"model.{name}", loaded_weight))

        self.model.load_weights(model_weights)
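For reference, enabling this EAGLE draft model outside the test harness looks roughly like the sketch below. The model names and kwargs mirror the ones exercised by the E2E tests above; the snippet itself is not part of this diff.

from vllm import LLM, SamplingParams

# Same tiny models and settings as the E2E test; adjust for real workloads.
llm = LLM(
    model="JackFram/llama-68m",
    speculative_model="abhigoyal/vllm-eagle-llama-68m-random",
    num_speculative_tokens=4,
    use_v2_block_manager=True,  # required for spec decode at the time of this PR
)

outputs = llm.generate(["Hello, my name is"],
                       SamplingParams(temperature=0.0, max_tokens=64))
print(outputs[0].outputs[0].text)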