2 changes: 1 addition & 1 deletion .github/workflows/pr-test.yml

@@ -123,7 +123,7 @@ jobs:
         timeout-minutes: 10
         run: |
           cd test/srt
-          python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_bs1
+          python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_bs1_default
 
       - name: Benchmark online latency
         timeout-minutes: 10
1 change: 1 addition & 0 deletions python/sglang/srt/utils.py

@@ -1970,6 +1970,7 @@ def is_fa3_default_architecture(hf_config):
         "Llama4ForConditionalGeneration",
         "LlamaForCausalLM",
         "MistralForCausalLM",
+        "MixtralForCausalLM",
         "Gemma2ForCausalLM",
         "Gemma3ForConditionalGeneration",
     }
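For context, a minimal sketch of how a helper like is_fa3_default_architecture plausibly consumes this set. Only the function signature and the visible set entries come from the hunk above; the `architectures` lookup on the HF config and the surrounding logic are assumptions, not the verbatim sglang source:

def is_fa3_default_architecture(hf_config) -> bool:
    """Hypothetical sketch: True if the model should default to the FA3 attention backend."""
    # Reproduces the visible entries from the diff; the real set may contain
    # additional architectures outside the displayed hunk.
    default_archs = {
        "Llama4ForConditionalGeneration",
        "LlamaForCausalLM",
        "MistralForCausalLM",
        "MixtralForCausalLM",  # added by this PR
        "Gemma2ForCausalLM",
        "Gemma3ForConditionalGeneration",
    }
    # Assumed: HF configs list model class names in the `architectures` field.
    architectures = getattr(hf_config, "architectures", None) or []
    return any(arch in default_archs for arch in architectures)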
2 changes: 1 addition & 1 deletion test/srt/run_suite.py

@@ -64,7 +64,7 @@ class TestFile:
     TestFile("test_retract_decode.py", 54),
     TestFile("test_server_args.py", 1),
     TestFile("test_skip_tokenizer_init.py", 117),
-    TestFile("test_srt_engine.py", 237),
+    TestFile("test_srt_engine.py", 261),
     TestFile("test_srt_endpoint.py", 130),
     TestFile("test_torch_compile.py", 76),
     TestFile("test_torch_compile_moe.py", 172),
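A hedged guess at the record being edited here: the TestFile name and the (filename, number) pairs are from the hunk, while the field name and its meaning (estimated runtime in seconds, presumably used to balance CI shards) are assumptions. Under that reading, the bump from 237 to 261 accounts for the extra time the renamed and expanded engine tests now take.

import dataclasses

# Hypothetical reconstruction; only the class name and the constructor calls
# appear in the diff itself.
@dataclasses.dataclass
class TestFile:
    name: str  # test module filename, e.g. "test_srt_engine.py"
    estimated_time: float = 60  # assumed: rough runtime in seconds for CI scheduling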
15 changes: 7 additions & 8 deletions test/srt/test_bench_one_batch.py

@@ -4,23 +4,22 @@
     DEFAULT_MODEL_NAME_FOR_TEST,
     DEFAULT_MOE_MODEL_NAME_FOR_TEST,
     CustomTestCase,
-    get_bool_env_var,
     is_in_ci,
     run_bench_one_batch,
     write_github_step_summary,
 )
 
 
 class TestBenchOneBatch(CustomTestCase):
-    def test_bs1(self):
+    def test_bs1_default(self):
         output_throughput = run_bench_one_batch(
             DEFAULT_MODEL_NAME_FOR_TEST, ["--cuda-graph-max-bs", "2"]
         )
 
         if is_in_ci():
             write_github_step_summary(
-                f"### test_bs1\n"
-                f"output_throughput : {output_throughput:.2f} token/s\n"
+                f"### test_bs1_default (llama-3.1-8b)\n"
+                f"output_throughput: {output_throughput:.2f} token/s\n"
             )
         self.assertGreater(output_throughput, 135)
 
@@ -32,9 +31,9 @@ def test_moe_tp2_bs1(self):
         if is_in_ci():
             write_github_step_summary(
                 f"### test_moe_tp2_bs1\n"
-                f"output_throughput : {output_throughput:.2f} token/s\n"
+                f"output_throughput: {output_throughput:.2f} token/s\n"
             )
-        self.assertGreater(output_throughput, 124)
+        self.assertGreater(output_throughput, 125)
 
     def test_torch_compile_tp2_bs1(self):
         output_throughput = run_bench_one_batch(
@@ -45,9 +44,9 @@ def test_torch_compile_tp2_bs1(self):
         if is_in_ci():
             write_github_step_summary(
                 f"### test_torch_compile_tp2_bs1\n"
-                f"output_throughput : {output_throughput:.2f} token/s\n"
+                f"output_throughput: {output_throughput:.2f} token/s\n"
             )
-        self.assertGreater(output_throughput, 225)
+        self.assertGreater(output_throughput, 220)
 
 
 if __name__ == "__main__":
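The benchmark tests above only publish numbers when running in CI. A minimal sketch of the two helpers involved, assuming they follow the standard GitHub Actions convention of appending Markdown to the file named by GITHUB_STEP_SUMMARY; the actual sglang implementations may differ in detail:

import os

def is_in_ci() -> bool:
    # Assumed check: GitHub Actions exports CI=true for every workflow run.
    return os.environ.get("CI", "").lower() == "true"

def write_github_step_summary(content: str) -> None:
    # GITHUB_STEP_SUMMARY names a per-step Markdown file that GitHub renders
    # on the workflow run's summary page; appending accumulates sections.
    with open(os.environ["GITHUB_STEP_SUMMARY"], "a") as f:
        f.write(content)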
10 changes: 5 additions & 5 deletions test/srt/test_bench_serving.py

@@ -98,7 +98,7 @@ def test_offline_throughput_with_triton_attention_backend(self):
             f"### test_offline_throughput_with_triton_attention_backend\n"
             f'Output throughput: {res["output_throughput"]:.2f} token/s\n'
         )
-        self.assertGreater(res["output_throughput"], 3600)
+        self.assertGreater(res["output_throughput"], 3700)
 
     def test_offline_throughput_default_fp8(self):
         res = run_bench_serving(
@@ -113,7 +113,7 @@ def test_offline_throughput_default_fp8(self):
             f"### test_offline_throughput_default_fp8\n"
             f'Output throughput: {res["output_throughput"]:.2f} token/s\n'
         )
-        self.assertGreater(res["output_throughput"], 4200)
+        self.assertGreater(res["output_throughput"], 4300)
 
     def test_online_latency_default(self):
         res = run_bench_serving(
@@ -126,7 +126,7 @@
         if is_in_ci():
             write_github_step_summary(
                 f"### test_online_latency_default\n"
-                f'median_e2e_latency_ms : {res["median_e2e_latency_ms"]:.2f} ms\n'
+                f'median_e2e_latency_ms: {res["median_e2e_latency_ms"]:.2f} ms\n'
             )
         self.assertLess(res["median_e2e_latency_ms"], 11000)
         self.assertLess(res["median_ttft_ms"], 86)
@@ -161,8 +161,8 @@ def test_online_latency_eagle(self):
         if is_in_ci():
             write_github_step_summary(
                 f"### test_online_latency_eagle\n"
-                f'median_e2e_latency_ms : {res["median_e2e_latency_ms"]:.2f} ms\n'
-                f'accept_length : {res["accept_length"]:.2f} \n'
+                f'median_e2e_latency_ms: {res["median_e2e_latency_ms"]:.2f} ms\n'
+                f'accept_length: {res["accept_length"]:.2f} \n'
             )
         self.assertLess(res["median_e2e_latency_ms"], 900)
         self.assertGreater(res["accept_length"], 3.0)
16 changes: 11 additions & 5 deletions test/srt/test_full_deepseek_v3.py

@@ -2,7 +2,6 @@
 from types import SimpleNamespace
 
 import requests
-import torch
 
 from sglang.srt.utils import kill_process_tree
 from sglang.test.few_shot_gsm8k import run_eval as run_eval_few_shot_gsm8k
@@ -49,7 +48,7 @@ def test_gsm8k(self):
         metrics = run_eval_few_shot_gsm8k(args)
         print(f"{metrics=}")
 
-        self.assertGreater(metrics["accuracy"], 0.94)
+        self.assertGreater(metrics["accuracy"], 0.935)
 
 
 class TestBenchOneBatch(CustomTestCase):
@@ -58,11 +57,11 @@ def test_bs1(self):
             FULL_DEEPSEEK_V3_MODEL_PATH,
             ["--trust-remote-code", "--tp", "8", "--cuda-graph-max-bs", "2"],
         )
-        print(f"output_throughput : {output_throughput:.2f} token/s")
+        print(f"{output_throughput=:.2f} token/s")
 
         if is_in_ci():
             write_github_step_summary(
-                f"### test_bs1\n"
-                f"output_throughput : {output_throughput:.2f} token/s\n"
+                f"### test_bs1 (deepseek-v3)\n" f"{output_throughput=:.2f} token/s\n"
             )
         self.assertGreater(output_throughput, 70)
 
@@ -121,6 +120,13 @@ def test_gsm8k(self):
         print(f"{avg_spec_accept_length=}")
         self.assertGreater(avg_spec_accept_length, 3.2)
 
+        if is_in_ci():
+            write_github_step_summary(
+                f"### test_gsm8k (deepseek-v3)\n"
+                f'{metrics["accuracy"]=:.3f}\n'
+                f"{avg_spec_accept_length=:.2f}\n"
+            )
+
 
 if __name__ == "__main__":
     unittest.main()
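The prints in this file switch from a hand-written label to the f-string debug specifier (Python 3.8+), which renders the expression text, an equals sign, and the formatted value. A quick illustration of the two styles with a made-up number:

output_throughput = 72.456  # hypothetical value, for illustration only
print(f"output_throughput : {output_throughput:.2f} token/s")  # old: "output_throughput : 72.46 token/s"
print(f"{output_throughput=:.2f} token/s")                     # new: "output_throughput=72.46 token/s"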