Merged
Commits (40)
1d17465
Add refactored OpenAI API server modules implementation
JustinTong0323 Jun 14, 2025
42bb560
Merge branch 'main' into refactor_oai_server_serving
JustinTong0323 Jun 14, 2025
d9ceddd
feat: add serving_embedding
JustinTong0323 Jun 14, 2025
f8d604b
Refactors request handling in OpenAI endpoints
JustinTong0323 Jun 14, 2025
a86bf27
Adds documentation to OpenAI API endpoints
JustinTong0323 Jun 14, 2025
5ddc8fc
Simplifies getting enable_thinking value
JustinTong0323 Jun 14, 2025
2ddbb40
rename serving_engine to serving_base
JustinTong0323 Jun 14, 2025
26771ad
Merge branch 'main' into refactor_oai_server_serving
JustinTong0323 Jun 14, 2025
4596b52
Makes chat template caching instance-specific
JustinTong0323 Jun 14, 2025
47d54dc
Refactors logprobs processing
JustinTong0323 Jun 14, 2025
8ac4349
Update python/sglang/srt/entrypoints/openai/protocol.py
JustinTong0323 Jun 14, 2025
00b202c
Improve test cases for eagle infer (#7173)
merrymercy Jun 14, 2025
fb4ae05
fix CI
JustinTong0323 Jun 14, 2025
81f5e41
Merge branch 'main' into refactor_oai_server_serving
JustinTong0323 Jun 14, 2025
2a10db7
Merge branch 'main' into refactor_oai_server_serving
JustinTong0323 Jun 14, 2025
3b28fdb
Removes unused utility functions
JustinTong0323 Jun 14, 2025
012bcb5
Refactors request validation for OpenAI endpoints
JustinTong0323 Jun 15, 2025
27341ae
Improves OpenAI serving base class logic
JustinTong0323 Jun 15, 2025
286751a
Refactors error handling for OpenAI endpoints
JustinTong0323 Jun 15, 2025
50d57d1
Refactors request ID generation
JustinTong0323 Jun 15, 2025
960f917
Removes RequestContext
JustinTong0323 Jun 15, 2025
30663a5
Simplifies enable_thinking handling and removes unused functions
JustinTong0323 Jun 15, 2025
eb6784d
Refactors sampling parameter building
JustinTong0323 Jun 15, 2025
47da102
Renames OpenAI serving handler classes
JustinTong0323 Jun 15, 2025
177efdc
Merge branch 'main' into refactor_oai_server_serving
JustinTong0323 Jun 15, 2025
c5a60e0
Clean up docs and imports
JustinTong0323 Jun 15, 2025
d433e43
Fixes usage calculation in streaming mode
JustinTong0323 Jun 15, 2025
ba42ea1
Refactors error response handling in OpenAIServingBase
JustinTong0323 Jun 16, 2025
48586bf
Apply suggestions from code review
JustinTong0323 Jun 16, 2025
3e03b74
Refactors test fixtures for clarity and remove some tests
JustinTong0323 Jun 16, 2025
ac908e1
Enables tool call constraint in sampling params
JustinTong0323 Jun 16, 2025
69e41f7
Move the `text = content["text"]` assignment in serving_chat for better readability
JustinTong0323 Jun 16, 2025
590db9a
lint
JustinTong0323 Jun 16, 2025
4c140c8
remove redundant logic
JustinTong0323 Jun 16, 2025
7190e6f
logic for generate_completion_prompt
JustinTong0323 Jun 16, 2025
40e97fc
Add comments back
JustinTong0323 Jun 16, 2025
84f6037
Merge branch 'main' into refactor_oai_server_serving
JustinTong0323 Jun 16, 2025
b95a288
fix tests
JustinTong0323 Jun 16, 2025
cc28f37
fix lint
JustinTong0323 Jun 16, 2025
ea30a8c
Merge branch 'main' into refactor_oai_server_serving
zhyncs Jun 17, 2025
48 changes: 0 additions & 48 deletions python/sglang/srt/entrypoints/openai/__init__.py
@@ -1,48 +0,0 @@
# Copyright 2023-2024 SGLang Team
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""
OpenAI-compatible API server module for SGLang.

This module provides OpenAI-compatible API endpoints that allow existing OpenAI client
applications to seamlessly work with SGLang models. The implementation includes:

Key Features:
- Full OpenAI API compatibility for chat completions, text completions, and embeddings
- Streaming support for real-time response generation
- Batch processing capabilities for multiple requests
- Function calling and tool use support
- Multimodal input support (text, images, audio)
- Advanced reasoning capabilities with separate reasoning content
- Custom sampling parameters and constraints (regex, JSON schema, EBNF)
- LoRA adapter support for fine-tuned models
- Cache reporting and token usage tracking

Supported Endpoints:
- /v1/chat/completions - Chat-based completions with conversation history
- /v1/completions - Text completions for single prompts
- /v1/embeddings - Text/multimodal embeddings generation
- /v1/models - Model listing and information

The module is structured with separate handlers for each endpoint type, all inheriting
from a common base class that provides shared functionality like request validation,
error handling, and response formatting.

Architecture:
- OpenAIServingBase: Abstract base class for all endpoint handlers
- ChatCompletionHandler: Handles chat completion requests
- CompletionHandler: Handles text completion requests
- EmbeddingHandler: Handles embedding requests
- Protocol classes: Pydantic models for request/response validation
- Utility functions: Shared helpers for formatting and validation
"""
178 changes: 64 additions & 114 deletions python/sglang/srt/entrypoints/openai/serving_base.py
@@ -1,42 +1,5 @@
# Copyright 2023-2024 SGLang Team
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""
Base serving engine for OpenAI API endpoints.

This module provides the foundational classes and request handling patterns
used by all OpenAI API endpoint implementations. It establishes a common
architecture for request processing, validation, and response generation.

Key Components:
- RequestContext: Tracks request state and metadata throughout processing
- OpenAIServingBase: Abstract base class for all endpoint handlers
- Common request handling patterns with proper error handling
- Validation integration for request parameters
- Streaming and non-streaming response support

Architecture Pattern:
All endpoint handlers inherit from OpenAIServingBase and implement:
1. _convert_to_internal_request: Transform OpenAI request to SGLang format
2. _handle_streaming_request: Process streaming requests
3. _handle_non_streaming_request: Process non-streaming requests

This ensures consistent behavior across all endpoints while allowing
endpoint-specific customization.
"""

import json
import logging
import time
import uuid
from abc import ABC, abstractmethod
from typing import Any, Dict, List, Optional, Union
@@ -45,49 +8,16 @@
from fastapi.responses import StreamingResponse

from sglang.srt.entrypoints.openai.protocol import (
ChatCompletionRequest,
CompletionRequest,
EmbeddingRequest,
ErrorResponse,
OpenAIServingRequest,
UsageInfo,
)
from sglang.srt.entrypoints.openai.utils import create_error_response
from sglang.srt.entrypoints.openai.validation import get_validation_rules
from sglang.srt.managers.io_struct import GenerateReqInput
from sglang.srt.managers.tokenizer_manager import TokenizerManager

logger = logging.getLogger(__name__)


class RequestContext:
"""Context object for tracking request state throughout the pipeline"""

def __init__(
self,
raw_request: Request,
openai_request: OpenAIServingRequest,
request_id: str,
):
self.raw_request = raw_request
self.openai_request = openai_request
self.request_id = request_id
self.start_time = time.time()
self.metadata: Dict[str, Any] = {}

def elapsed_time(self) -> float:
"""Get elapsed time since request started"""
return time.time() - self.start_time

def add_metadata(self, key: str, value: Any) -> None:
"""Add metadata to the request context"""
self.metadata[key] = value

def get_metadata(self, key: str, default: Any = None) -> Any:
"""Get metadata from the request context"""
return self.metadata.get(key, default)


# Base class for specific endpoint handlers
class OpenAIServingBase(ABC):
"""Abstract base class for OpenAI endpoint handlers"""
@@ -101,54 +31,44 @@ async def handle_request(
"""Handle the specific request type with common pattern"""
try:
# Validate request
error = self._validate_request(request)
if error:
return error

# Create request context
ctx = RequestContext(
raw_request=raw_request,
openai_request=request,
request_id=self._generate_request_id(request),
)
error_msg = self._validate_request(request)
if error_msg:
return self.create_error_response(error_msg)

# Convert to internal format
adapted_request, processed_request = self._convert_to_internal_request(
[request], [ctx.request_id]
[request], [self._generate_request_id_base(request)]
)

# Check if this handler supports streaming
# Note(Xinyuan): raw_request below is only used for detecting the connection of the client
if hasattr(request, "stream") and request.stream:
return await self._handle_streaming_request(
adapted_request, processed_request, ctx
adapted_request, processed_request, raw_request
)
else:
return await self._handle_non_streaming_request(
adapted_request, processed_request, ctx
adapted_request, processed_request, raw_request
)

except Exception as e:
logger.error(f"Error in request: {e}")
return create_error_response(
return self.create_error_response(
message=f"Internal server error: {str(e)}",
err_type="InternalServerError",
status_code=500,
)

def _generate_request_id(self, request: OpenAIServingRequest) -> str:
@abstractmethod
def _request_id_prefix(self) -> str:
"""Generate request ID based on request type"""
pass

def _generate_request_id_base(self, request: OpenAIServingRequest) -> str:
"""Generate request ID based on request type"""
# Default implementation - can be overridden
if rid := getattr(request, "rid", None):
return rid

# Determine prefix based on request type
prefix_mapping = {
ChatCompletionRequest: "chatcmpl",
CompletionRequest: "cmpl",
EmbeddingRequest: "embd",
}
prefix = prefix_mapping.get(type(request), "req")
return f"{prefix}-{uuid.uuid4()}"
return f"{self._request_id_prefix()}{uuid.uuid4().hex}"

@abstractmethod
def _convert_to_internal_request(
@@ -161,37 +81,41 @@ def _convert_to_internal_request(
"""Convert OpenAI request to internal format"""
pass

@abstractmethod
async def _handle_streaming_request(
self,
adapted_request: GenerateReqInput,
request: OpenAIServingRequest,
ctx: RequestContext,
raw_request: Request,
) -> StreamingResponse:
"""Handle streaming request"""
pass
"""Handle streaming request

Override this method in child classes that support streaming requests.
"""
return self.create_error_response(
message=f"{self.__class__.__name__} does not support streaming requests",
err_type="NotImplementedError",
status_code=501,
)

@abstractmethod
async def _handle_non_streaming_request(
self,
adapted_request: GenerateReqInput,
request: OpenAIServingRequest,
ctx: RequestContext,
raw_request: Request,
) -> Union[Any, ErrorResponse]:
"""Handle non-streaming request"""
pass
"""Handle non-streaming request

Override this method in child classes that support non-streaming requests.
"""
return self.create_error_response(
message=f"{self.__class__.__name__} does not support non-streaming requests",
err_type="NotImplementedError",
status_code=501,
)

def _validate_request(
self, request: OpenAIServingRequest
) -> Optional[ErrorResponse]:
def _validate_request(self, request: OpenAIServingRequest) -> Optional[str]:
"""Validate request"""
validation_rules = get_validation_rules(request)
for rule in validation_rules:
param_value = rule.param_getter(request)
error_msg = rule.validator_func(param_value)
if error_msg:
return create_error_response(error_msg, param=rule.param_name)
return None
pass

def _calculate_streaming_usage_base(
self,
@@ -219,3 +143,29 @@ def _calculate_streaming_usage_base(
total_tokens=total_prompt_tokens + total_completion_tokens,
prompt_tokens_details=prompt_tokens_details,
)

def create_error_response(
self,
message: str,
err_type: str = "BadRequestError",
status_code: int = 400,
param: Optional[str] = None,
) -> ErrorResponse:
"""Create an error response"""
return ErrorResponse(
object="error",
message=message,
type=err_type,
param=param,
code=status_code,
)

def create_streaming_error_response(
self,
message: str,
err_type: str = "BadRequestError",
status_code: int = 400,
) -> str:
"""Create a streaming error response"""
error = self.create_error_response(message, err_type, status_code)
return json.dumps({"error": error.model_dump()})
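
Net effect of the serving_base.py changes: RequestContext and the rule-based validation layer are gone, the streaming and non-streaming handlers are no longer abstract but default to a 501 "not supported" error, and request IDs come from a per-endpoint _request_id_prefix. A standalone sketch of that pattern, with SGLang's request and error types stubbed out (EchoHandler and the simplified dict-based signatures are hypothetical):

import uuid
from abc import ABC, abstractmethod
from typing import Any

class ServingBase(ABC):
    @abstractmethod
    def _request_id_prefix(self) -> str:
        ...

    def _generate_request_id_base(self, request: Any) -> str:
        # Honor a caller-supplied rid, otherwise prefix a fresh UUID hex string.
        if rid := getattr(request, "rid", None):
            return rid
        return f"{self._request_id_prefix()}{uuid.uuid4().hex}"

    async def _handle_streaming_request(self, request: Any) -> dict:
        # Default for endpoints that never stream; override where streaming is supported.
        return {"error": f"{type(self).__name__} does not support streaming requests", "code": 501}

    async def _handle_non_streaming_request(self, request: Any) -> dict:
        return {"error": f"{type(self).__name__} does not support non-streaming requests", "code": 501}

class EchoHandler(ServingBase):
    # Implements only the non-streaming path; streaming falls back to the 501 default.
    def _request_id_prefix(self) -> str:
        return "echo-"

    async def _handle_non_streaming_request(self, request: Any) -> dict:
        return {"id": self._generate_request_id_base(request), "echo": request}

Because _validate_request now returns Optional[str] rather than an ErrorResponse, handlers report problems as plain messages and the base class alone decides how to wrap them: create_error_response for regular responses, create_streaming_error_response for the JSON payload of an SSE stream.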