
Commit 5435e8a (parent: 7eac525)
Author: luoyuan.luo

Add torch_symm_mem server arg

5 files changed: +22 additions, -8 deletions

benchmark/lora/launch_server.py

Lines changed: 7 additions & 0 deletions
@@ -28,6 +28,8 @@ def launch_server(args):
         cmd += "--disable-custom-all-reduce"
     if args.enable_mscclpp:
         cmd += "--enable-mscclpp"
+    if args.enable_torch_symm_mem:
+        cmd += "--enable-torch-symm-mem"
     print(cmd)
     os.system(cmd)

@@ -70,6 +72,11 @@ def launch_server(args):
         action="store_true",
         help="Enable using mscclpp for small messages for all-reduce kernel and fall back to NCCL.",
     )
+    parser.add_argument(
+        "--enable-torch-symm-mem",
+        action="store_true",
+        help="Enable using torch symm mem for all-reduce kernel and fall back to NCCL.",
+    )
     args = parser.parse_args()

     launch_server(args)
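With this change the benchmark launcher accepts the new flag and appends it to the server command it shells out to. A minimal invocation might look like the following; any other options the script requires (for example, which model to benchmark) are omitted here and depend on the script's remaining arguments:

    python benchmark/lora/launch_server.py --enable-torch-symm-mem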

python/sglang/srt/distributed/parallel_state.py

Lines changed: 7 additions & 6 deletions
@@ -208,7 +208,7 @@ class GroupCoordinator:
     use_pynccl: bool  # a hint of whether to use PyNccl
     use_pymscclpp: bool  # a hint of whether to use PyMsccl
     use_custom_allreduce: bool  # a hint of whether to use CustomAllreduce
-    use_symm_mem: bool  # a hint of whether to use SymmMemAllReduce
+    use_torch_symm_mem: bool  # a hint of whether to use SymmMemAllReduce
     use_message_queue_broadcaster: (
         bool  # a hint of whether to use message queue broadcaster
     )
@@ -226,7 +226,7 @@ def __init__(
         use_pynccl: bool,
         use_pymscclpp: bool,
         use_custom_allreduce: bool,
-        use_symm_mem: bool,
+        use_torch_symm_mem: bool,
         use_hpu_communicator: bool,
         use_xpu_communicator: bool,
         use_npu_communicator: bool,
@@ -275,7 +275,7 @@ def __init__(
         self.use_pynccl = use_pynccl
         self.use_pymscclpp = use_pymscclpp
         self.use_custom_allreduce = use_custom_allreduce
-        self.use_symm_mem = use_symm_mem
+        self.use_torch_symm_mem = use_torch_symm_mem
         self.use_hpu_communicator = use_hpu_communicator
         self.use_xpu_communicator = use_xpu_communicator
         self.use_npu_communicator = use_npu_communicator
@@ -343,7 +343,7 @@ def __init__(
             logger.warning(f"Failed to initialize QuickAllReduce: {e}")

         self.symm_mem_comm: Optional[SymmMemCommunicator] = None
-        if self.use_symm_mem and self.world_size > 1:
+        if self.use_torch_symm_mem and self.world_size > 1:
             self.symm_mem_comm = SymmMemCommunicator(
                 group=self.cpu_group,
                 device=self.device,
@@ -453,6 +453,7 @@ def graph_capture(
         # custom allreduce  | enabled  | enabled  |
         # PyNccl            | disabled | enabled  |
         # PyMscclpp         | disabled | enabled  |
+        # TorchSymmMem      | disabled | enabled  |
         # torch.distributed | enabled  | disabled |
         #
         # Note: When custom quick allreduce is enabled, a runtime check
@@ -1223,7 +1224,7 @@ def init_world_group(
         use_pynccl=False,
         use_pymscclpp=False,
         use_custom_allreduce=False,
-        use_symm_mem=False,
+        use_torch_symm_mem=False,
         use_hpu_communicator=False,
         use_xpu_communicator=False,
         use_npu_communicator=False,
@@ -1254,7 +1255,7 @@ def init_model_parallel_group(
         use_pynccl=not _is_npu,
         use_pymscclpp=use_mscclpp_allreduce,
         use_custom_allreduce=use_custom_allreduce,
-        use_symm_mem=use_symm_mem_allreduce,
+        use_torch_symm_mem=use_symm_mem_allreduce,
         use_hpu_communicator=True,
         use_xpu_communicator=True,
         use_npu_communicator=True,
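For context, the renamed hint feeds the backend selection that the graph-capture table above describes: specialized kernels are tried first and torch.distributed (NCCL) is the fallback. The sketch below is illustrative only, not the real GroupCoordinator method; the communicator helper names (should_custom_ar, should_use_symm_mem) are assumptions and are not taken from this diff.

import torch
import torch.distributed as dist


def _all_reduce_dispatch(coordinator, tensor: torch.Tensor) -> torch.Tensor:
    # Illustrative fallback order: custom all-reduce kernel, then the torch
    # symmetric-memory path, and finally plain torch.distributed (NCCL).
    ca = getattr(coordinator, "ca_comm", None)
    if ca is not None and ca.should_custom_ar(tensor):  # assumed helper API
        return ca.custom_all_reduce(tensor)
    symm = getattr(coordinator, "symm_mem_comm", None)
    if symm is not None and symm.should_use_symm_mem(tensor):  # assumed helper API
        return symm.all_reduce(tensor)
    dist.all_reduce(tensor, group=coordinator.device_group)
    return tensor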

python/sglang/srt/layers/dp_attention.py

Lines changed: 1 addition & 1 deletion
@@ -263,7 +263,7 @@ def initialize_dp_attention(
         use_pynccl=SYNC_TOKEN_IDS_ACROSS_TP,
         use_pymscclpp=False,
         use_custom_allreduce=False,
-        use_symm_mem=False,
+        use_torch_symm_mem=False,
         use_hpu_communicator=False,
         use_xpu_communicator=False,
         use_npu_communicator=False,

python/sglang/srt/model_executor/model_runner.py

Lines changed: 1 addition & 1 deletion
@@ -625,7 +625,7 @@ def init_torch_distributed(self):
         dist_init_method = f"tcp://127.0.0.1:{self.dist_port}"
         set_custom_all_reduce(not self.server_args.disable_custom_all_reduce)
         set_mscclpp_all_reduce(self.server_args.enable_mscclpp)
-        set_symm_mem_all_reduce(self.server_args.enable_symm_mem)
+        set_symm_mem_all_reduce(self.server_args.enable_torch_symm_mem)

         if not self.is_draft_worker:
             if self.device == "cpu":
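The call above follows the same pattern as set_custom_all_reduce and set_mscclpp_all_reduce: the server argument is stored as a process-wide hint that parallel_state later reads (as use_symm_mem_allreduce) when it builds the model-parallel GroupCoordinator. A minimal sketch of that pattern, with the module-level variable name assumed rather than taken from this diff:

_ENABLE_SYMM_MEM_ALL_REDUCE = False  # hypothetical module-level hint


def set_symm_mem_all_reduce(enable: bool) -> None:
    # Record the user's choice; init_model_parallel_group() consults it when
    # constructing the GroupCoordinator with use_torch_symm_mem=...
    global _ENABLE_SYMM_MEM_ALL_REDUCE
    _ENABLE_SYMM_MEM_ALL_REDUCE = enable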

python/sglang/srt/server_args.py

Lines changed: 6 additions & 0 deletions
@@ -358,6 +358,7 @@ class ServerArgs:
     disable_outlines_disk_cache: bool = False
     disable_custom_all_reduce: bool = False
     enable_mscclpp: bool = False
+    enable_torch_symm_mem: bool = False
     disable_overlap_schedule: bool = False
     enable_mixed_chunk: bool = False
     enable_dp_attention: bool = False
@@ -2090,6 +2091,11 @@ def add_cli_args(parser: argparse.ArgumentParser):
            action="store_true",
            help="Enable using mscclpp for small messages for all-reduce kernel and fall back to NCCL.",
        )
+       parser.add_argument(
+           "--enable-torch-symm-mem",
+           action="store_true",
+           help="Enable using torch symm mem for all-reduce kernel and fall back to NCCL.",
+       )
        parser.add_argument(
            "--disable-overlap-schedule",
            action="store_true",
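Once registered in ServerArgs, the feature is toggled from the command line at launch time. A hypothetical example, assuming the standard sglang.launch_server entry point (model path and tensor-parallel size are placeholders, not part of this commit):

    python -m sglang.launch_server --model-path <your-model> --tp 8 --enable-torch-symm-mem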
