
Commit 64925c5

Run latency and throughput benchmark for Qwen3 and Gemma3 (#86)
Signed-off-by: Huy Do <[email protected]>
1 parent ee0085b commit 64925c5

3 files changed: +48 −2 lines


.github/workflows/vllm-benchmark.yml

Lines changed: 2 additions & 2 deletions
@@ -2,8 +2,8 @@ name: vLLM Benchmark
 
 on:
   schedule:
-    # Run every 6 hours
-    - cron: '0 */6 * * *'
+    # Run every 12 hours
+    - cron: '0 */12 * * *'
   workflow_dispatch:
     inputs:
       vllm_branch:
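The new cron expression '0 */12 * * *' fires at minute 0 of every 12th hour (00:00 and 12:00 UTC), so the scheduled benchmark runs drop from four to two per day. A minimal sketch to check the expression, assuming the third-party croniter package (not something the workflow itself uses):

# Sketch: enumerate the next firings of the new schedule.
# Assumes the third-party "croniter" package, which is not part of this repo.
from datetime import datetime, timezone

from croniter import croniter

schedule = "0 */12 * * *"  # minute 0 of every 12th hour (00:00 and 12:00 UTC)
it = croniter(schedule, datetime(2025, 1, 1, tzinfo=timezone.utc))
for _ in range(4):
    print(it.get_next(datetime))  # 2025-01-01 12:00, 2025-01-02 00:00, ...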

vllm-benchmarks/benchmarks/cuda/latency-tests.json

Lines changed: 22 additions & 0 deletions
@@ -105,5 +105,27 @@
       "num_iters": 15,
       "max_model_len": 8192
     }
+  },
+  {
+    "test_name": "latency_gemma_3_27b_it_tp8",
+    "parameters": {
+      "model": "google/gemma-3-27b-it",
+      "tensor_parallel_size": 8,
+      "load_format": "dummy",
+      "num_iters_warmup": 5,
+      "num_iters": 15,
+      "max_model_len": 8192
+    }
+  },
+  {
+    "test_name": "latency_qwen3_30b_a3b_tp8",
+    "parameters": {
+      "model": "Qwen/Qwen3-30B-A3B",
+      "tensor_parallel_size": 8,
+      "load_format": "dummy",
+      "num_iters_warmup": 5,
+      "num_iters": 15,
+      "max_model_len": 8192
+    }
   }
 ]
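Both new latency entries reuse the shape of the existing ones: dummy weights (load_format "dummy"), tensor_parallel_size 8, 5 warmup iterations and 15 measured iterations at max_model_len 8192. A minimal sketch of how such an entry could be expanded into a benchmark_latency.py command line, assuming (as the vLLM benchmark suites do) that each JSON parameter maps to the matching --kebab-case flag; the script path below is illustrative:

# Sketch: expand a latency-tests.json entry into a CLI invocation.
# Assumption: every JSON parameter maps to the matching --kebab-case flag of
# vLLM's benchmarks/benchmark_latency.py; boolean values become bare flags.
import json
import shlex


def to_cli_args(params: dict) -> list[str]:
    args = []
    for key, value in params.items():
        flag = "--" + key.replace("_", "-")
        if isinstance(value, bool):
            if value:
                args.append(flag)
        else:
            args.extend([flag, str(value)])
    return args


with open("vllm-benchmarks/benchmarks/cuda/latency-tests.json") as f:
    tests = json.load(f)

for test in tests:
    cmd = ["python", "benchmarks/benchmark_latency.py", *to_cli_args(test["parameters"])]
    print(test["test_name"], "->", shlex.join(cmd))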

vllm-benchmarks/benchmarks/cuda/throughput-tests.json

Lines changed: 24 additions & 0 deletions
@@ -115,5 +115,29 @@
       "backend": "vllm",
       "max_model_len": 8192
     }
+  },
+  {
+    "test_name": "throughput_gemma_3_27b_it_tp8",
+    "parameters": {
+      "model": "google/gemma-3-27b-it",
+      "tensor_parallel_size": 8,
+      "load_format": "dummy",
+      "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+      "num_prompts": 200,
+      "backend": "vllm",
+      "max_model_len": 8192
+    }
+  },
+  {
+    "test_name": "throughput_qwen3_30b_a3b_tp8",
+    "parameters": {
+      "model": "Qwen/Qwen3-30B-A3B",
+      "tensor_parallel_size": 8,
+      "load_format": "dummy",
+      "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+      "num_prompts": 200,
+      "backend": "vllm",
+      "max_model_len": 8192
+    }
   }
 ]
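The throughput entries add a dataset path and prompt count on top of the latency parameters: 200 ShareGPT prompts per run against the vllm backend. A small pre-flight sketch, assuming the ShareGPT_V3_unfiltered_cleaned_split.json file has already been downloaded into the working directory by an earlier step (that provisioning is not part of this commit):

# Sketch: sanity-check the throughput configs before launching a run.
# Assumption: the ShareGPT file referenced by "dataset" is fetched elsewhere;
# this commit only adds the test entries, not the download step.
import json
from pathlib import Path

with open("vllm-benchmarks/benchmarks/cuda/throughput-tests.json") as f:
    tests = json.load(f)

for test in tests:
    params = test["parameters"]
    dataset = params.get("dataset")
    if dataset and not Path(dataset).exists():
        raise FileNotFoundError(
            f"{test['test_name']}: missing dataset {dataset}; "
            "download ShareGPT_V3_unfiltered_cleaned_split.json first"
        )
    print(f"{test['test_name']}: {params['model']} "
          f"(tp={params['tensor_parallel_size']}, num_prompts={params.get('num_prompts')})")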
