 import torch
 import uvloop

-from sglang.srt.code_completion_parser import load_completion_template_for_openai_api
 from sglang.srt.entrypoints.EngineBase import EngineBase
 from sglang.srt.managers.data_parallel_controller import (
     run_data_parallel_controller_process,
@@ -58,11 +57,8 @@
     UpdateWeightsFromTensorReqInput,
 )
 from sglang.srt.managers.scheduler import run_scheduler_process
+from sglang.srt.managers.template_manager import TemplateManager
 from sglang.srt.managers.tokenizer_manager import TokenizerManager
-from sglang.srt.openai_api.adapter import (
-    guess_chat_template_name_from_model_path,
-    load_chat_template_for_openai_api,
-)
 from sglang.srt.server_args import PortArgs, ServerArgs
 from sglang.srt.torch_memory_saver_adapter import TorchMemorySaverAdapter
 from sglang.srt.utils import (
@@ -123,12 +119,13 @@ def __init__(self, **kwargs):
         logger.info(f"{server_args=}")

         # Launch subprocesses
-        tokenizer_manager, scheduler_info = _launch_subprocesses(
+        tokenizer_manager, template_manager, scheduler_info = _launch_subprocesses(
             server_args=server_args,
             port_args=port_args,
         )
         self.server_args = server_args
         self.tokenizer_manager = tokenizer_manager
+        self.template_manager = template_manager
         self.scheduler_info = scheduler_info

         context = zmq.Context(2)
@@ -647,7 +644,7 @@ def sigquit_handler(signum, frame):

 def _launch_subprocesses(
     server_args: ServerArgs, port_args: Optional[PortArgs] = None
-) -> Tuple[TokenizerManager, Dict]:
+) -> Tuple[TokenizerManager, TemplateManager, Dict]:
     """
     Launch the TokenizerManager in the main process, the Scheduler in a subprocess, and the DetokenizerManager in another subprocess.
     """
@@ -732,7 +729,7 @@ def _launch_subprocesses(

         if os.getenv("SGLANG_BLOCK_NONZERO_RANK_CHILDREN") == "0":
             # When using `Engine` as a Python API, we don't want to block here.
-            return None, None
+            return None, None, None

         launch_dummy_health_check_server(server_args.host, server_args.port)

@@ -741,7 +738,7 @@ def _launch_subprocesses(
             logger.error(
                 f"Scheduler or DataParallelController {proc.pid} terminated with {proc.exitcode}"
             )
-        return None, None
+        return None, None, None

     # Launch detokenizer process
     detoken_proc = mp.Process(
@@ -755,15 +752,15 @@ def _launch_subprocesses(

     # Launch tokenizer process
     tokenizer_manager = TokenizerManager(server_args, port_args)
-    if server_args.chat_template:
-        load_chat_template_for_openai_api(
-            tokenizer_manager, server_args.chat_template, server_args.model_path
-        )
-    else:
-        guess_chat_template_name_from_model_path(server_args.model_path)

-    if server_args.completion_template:
-        load_completion_template_for_openai_api(server_args.completion_template)
+    # Initialize templates
+    template_manager = TemplateManager()
+    template_manager.initialize_templates(
+        tokenizer_manager=tokenizer_manager,
+        model_path=server_args.model_path,
+        chat_template=server_args.chat_template,
+        completion_template=server_args.completion_template,
+    )

     # Wait for the model to finish loading
     scheduler_infos = []
@@ -787,4 +784,4 @@ def _launch_subprocesses(
     # Assume all schedulers have the same scheduler_info
     scheduler_info = scheduler_infos[0]
     tokenizer_manager.max_req_input_len = scheduler_info["max_req_input_len"]
-    return tokenizer_manager, scheduler_info
+    return tokenizer_manager, template_manager, scheduler_info
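For callers of _launch_subprocesses outside this file, the practical change is unpacking three values instead of two and keeping a handle on the returned TemplateManager. A minimal sketch of the new call pattern, assuming the module sits at its usual sglang.srt.entrypoints.engine path; the ServerArgs configuration below is illustrative only:

# Hedged sketch of the updated call pattern; model_path value is hypothetical.
from sglang.srt.entrypoints.engine import _launch_subprocesses
from sglang.srt.server_args import ServerArgs

server_args = ServerArgs(model_path="path/to/model")  # illustrative configuration
tokenizer_manager, template_manager, scheduler_info = _launch_subprocesses(
    server_args=server_args,
)
# template_manager now owns the chat/completion template state that the removed
# load_chat_template_for_openai_api / load_completion_template_for_openai_api
# helpers previously managed at module level.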