Merge pull request #3 from yubofredwang/add-spec-dec-top-1

hebiao064 · web-flow · commit 77d6ef4b9040 · 2025-04-07T23:14:17.000-07:00
Add integration test for speculative decoding with top-k = 1
diff --git a/test/srt/test_fa3.py b/test/srt/test_fa3.py
@@ -7,6 +7,8 @@
 from sglang.srt.utils import get_device_sm, kill_process_tree
 from sglang.test.few_shot_gsm8k import run_eval as run_eval_few_shot_gsm8k
 from sglang.test.test_utils import (
+    DEFAULT_EAGLE_DRAFT_MODEL_FOR_TEST,
+    DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST,
     DEFAULT_MLA_MODEL_NAME_FOR_TEST,
     DEFAULT_MODEL_NAME_FOR_TEST,
     DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
@@ -123,11 +125,65 @@ def get_server_args(cls):
 class TestFlashAttention3SpeculativeDecode(BaseFlashAttentionTest):
     """Test FlashAttention3 with speculative decode enabled."""
 
+    # Target model for this test class (presumably read by the base class when
+    # launching the server -- confirm in BaseFlashAttentionTest).
+    model = DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST
+
     @classmethod
     def get_server_args(cls):
+        """Return the parent's server args plus the speculative-decoding flags."""
         args = super().get_server_args()
+        # Option names are spelled out in full: abbreviated flags rely on argparse
+        # prefix matching and break once the prefix becomes ambiguous.
+        args.extend(
+            [
+                "--cuda-graph-max-bs",
+                "2",
+                "--speculative-algorithm",
+                "EAGLE3",
+                "--speculative-draft-model-path",
+                DEFAULT_EAGLE_DRAFT_MODEL_FOR_TEST,
+                "--speculative-num-steps",
+                "3",
+                "--speculative-eagle-topk",
+                "1",
+                "--speculative-num-draft-tokens",
+                "3",
+                "--dtype",
+                "float16",
+            ]
+        )
         return args
 
+    def test_gsm8k(self):
+        """Run few-shot GSM8K and check accuracy and speculative accept length.
+
+        Overrides the inherited ``test_gsm8k`` so that, besides the accuracy
+        threshold, the server's average speculative accept length is asserted.
+        """
+        requests.get(self.base_url + "/flush_cache")
+
+        # Host and port are passed separately; the port is parsed off base_url.
+        args = SimpleNamespace(
+            num_shots=5,
+            data_path=DATA_PATH,
+            num_questions=200,
+            max_new_tokens=512,
+            parallel=128,
+            host="http://127.0.0.1",
+            port=int(self.base_url.split(":")[-1]),
+        )
+        metrics = run_eval_few_shot_gsm8k(args)
+        print(metrics)
+
+        self.assertGreater(metrics["accuracy"], 0.60)
+
+        # The accept-length statistic is read back from the running server.
+        server_info = requests.get(self.base_url + "/get_server_info")
+        avg_spec_accept_length = server_info.json()["avg_spec_accept_length"]
+        print(f"{avg_spec_accept_length=}")
+        self.assertGreater(avg_spec_accept_length, 1.5)
+
 
 if __name__ == "__main__":
     unittest.main()