Merged
Commits (40)
1d17465
Add refactored OpenAI API server modules implementation
JustinTong0323 Jun 14, 2025
42bb560
Merge branch 'main' into refactor_oai_server_serving
JustinTong0323 Jun 14, 2025
d9ceddd
feat: add serving_embedding
JustinTong0323 Jun 14, 2025
f8d604b
Refactors request handling in OpenAI endpoints
JustinTong0323 Jun 14, 2025
a86bf27
Adds documentation to OpenAI API endpoints
JustinTong0323 Jun 14, 2025
5ddc8fc
Simplifies getting enable_thinking value
JustinTong0323 Jun 14, 2025
2ddbb40
rename serving_engine to serving_base
JustinTong0323 Jun 14, 2025
26771ad
Merge branch 'main' into refactor_oai_server_serving
JustinTong0323 Jun 14, 2025
4596b52
Makes chat template caching instance-specific
JustinTong0323 Jun 14, 2025
47d54dc
Refactors logprobs processing
JustinTong0323 Jun 14, 2025
8ac4349
Update python/sglang/srt/entrypoints/openai/protocol.py
JustinTong0323 Jun 14, 2025
00b202c
Improve test cases for eagle infer (#7173)
merrymercy Jun 14, 2025
fb4ae05
fix CI
JustinTong0323 Jun 14, 2025
81f5e41
Merge branch 'main' into refactor_oai_server_serving
JustinTong0323 Jun 14, 2025
2a10db7
Merge branch 'main' into refactor_oai_server_serving
JustinTong0323 Jun 14, 2025
3b28fdb
Removes unused utility functions
JustinTong0323 Jun 14, 2025
012bcb5
Refactors request validation for OpenAI endpoints
JustinTong0323 Jun 15, 2025
27341ae
Improves OpenAI serving base class logic
JustinTong0323 Jun 15, 2025
286751a
Refactors error handling for OpenAI endpoints
JustinTong0323 Jun 15, 2025
50d57d1
Refactors request ID generation
JustinTong0323 Jun 15, 2025
960f917
Removes RequestContext
JustinTong0323 Jun 15, 2025
30663a5
Simplifies enable_thinking handling and removes unused functions
JustinTong0323 Jun 15, 2025
eb6784d
Refactors sampling parameter building
JustinTong0323 Jun 15, 2025
47da102
Renames OpenAI serving handler classes
JustinTong0323 Jun 15, 2025
177efdc
Merge branch 'main' into refactor_oai_server_serving
JustinTong0323 Jun 15, 2025
c5a60e0
Clean up docs and imports
JustinTong0323 Jun 15, 2025
d433e43
Fixes usage calculation in streaming mode
JustinTong0323 Jun 15, 2025
ba42ea1
Refactors error response handling in OpenAIServingBase
JustinTong0323 Jun 16, 2025
48586bf
Apply suggestions from code review
JustinTong0323 Jun 16, 2025
3e03b74
Refactors test fixtures for clarity and remove some tests
JustinTong0323 Jun 16, 2025
ac908e1
Enables tool call constraint in sampling params
JustinTong0323 Jun 16, 2025
69e41f7
Move the `text = content["text"]` assignment in serving_chat for better readability
JustinTong0323 Jun 16, 2025
590db9a
lint
JustinTong0323 Jun 16, 2025
4c140c8
remove redundant logic
JustinTong0323 Jun 16, 2025
7190e6f
logic for generate_completion_prompt
JustinTong0323 Jun 16, 2025
40e97fc
Add comments back
JustinTong0323 Jun 16, 2025
84f6037
Merge branch 'main' into refactor_oai_server_serving
JustinTong0323 Jun 16, 2025
b95a288
fix tests
JustinTong0323 Jun 16, 2025
cc28f37
fix lint
JustinTong0323 Jun 16, 2025
ea30a8c
Merge branch 'main' into refactor_oai_server_serving
zhyncs Jun 17, 2025
48 changes: 0 additions & 48 deletions python/sglang/srt/entrypoints/openai/__init__.py
@@ -1,48 +0,0 @@
# Copyright 2023-2024 SGLang Team
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""
OpenAI-compatible API server module for SGLang.

This module provides OpenAI-compatible API endpoints that allow existing OpenAI client
applications to seamlessly work with SGLang models. The implementation includes:

Key Features:
- Full OpenAI API compatibility for chat completions, text completions, and embeddings
- Streaming support for real-time response generation
- Batch processing capabilities for multiple requests
- Function calling and tool use support
- Multimodal input support (text, images, audio)
- Advanced reasoning capabilities with separate reasoning content
- Custom sampling parameters and constraints (regex, JSON schema, EBNF)
- LoRA adapter support for fine-tuned models
- Cache reporting and token usage tracking

Supported Endpoints:
- /v1/chat/completions - Chat-based completions with conversation history
- /v1/completions - Text completions for single prompts
- /v1/embeddings - Text/multimodal embeddings generation
- /v1/models - Model listing and information

The module is structured with separate handlers for each endpoint type, all inheriting
from a common base class that provides shared functionality like request validation,
error handling, and response formatting.

Architecture:
- OpenAIServingBase: Abstract base class for all endpoint handlers
- ChatCompletionHandler: Handles chat completion requests
- CompletionHandler: Handles text completion requests
- EmbeddingHandler: Handles embedding requests
- Protocol classes: Pydantic models for request/response validation
- Utility functions: Shared helpers for formatting and validation
"""
178 changes: 64 additions & 114 deletions python/sglang/srt/entrypoints/openai/serving_base.py
@@ -1,42 +1,5 @@
# Copyright 2023-2024 SGLang Team
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""
Base serving engine for OpenAI API endpoints.

This module provides the foundational classes and request handling patterns
used by all OpenAI API endpoint implementations. It establishes a common
architecture for request processing, validation, and response generation.

Key Components:
- RequestContext: Tracks request state and metadata throughout processing
- OpenAIServingBase: Abstract base class for all endpoint handlers
- Common request handling patterns with proper error handling
- Validation integration for request parameters
- Streaming and non-streaming response support

Architecture Pattern:
All endpoint handlers inherit from OpenAIServingBase and implement:
1. _convert_to_internal_request: Transform OpenAI request to SGLang format
2. _handle_streaming_request: Process streaming requests
3. _handle_non_streaming_request: Process non-streaming requests

This ensures consistent behavior across all endpoints while allowing
endpoint-specific customization.
"""

import json
import logging
import time
import uuid
from abc import ABC, abstractmethod
from typing import Any, Dict, List, Optional, Union
@@ -45,49 +8,16 @@
from fastapi.responses import StreamingResponse

from sglang.srt.entrypoints.openai.protocol import (
ChatCompletionRequest,
CompletionRequest,
EmbeddingRequest,
ErrorResponse,
OpenAIServingRequest,
UsageInfo,
)
from sglang.srt.entrypoints.openai.utils import create_error_response
from sglang.srt.entrypoints.openai.validation import get_validation_rules
from sglang.srt.managers.io_struct import GenerateReqInput
from sglang.srt.managers.tokenizer_manager import TokenizerManager

logger = logging.getLogger(__name__)


class RequestContext:
"""Context object for tracking request state throughout the pipeline"""

def __init__(
self,
raw_request: Request,
openai_request: OpenAIServingRequest,
request_id: str,
):
self.raw_request = raw_request
self.openai_request = openai_request
self.request_id = request_id
self.start_time = time.time()
self.metadata: Dict[str, Any] = {}

def elapsed_time(self) -> float:
"""Get elapsed time since request started"""
return time.time() - self.start_time

def add_metadata(self, key: str, value: Any) -> None:
"""Add metadata to the request context"""
self.metadata[key] = value

def get_metadata(self, key: str, default: Any = None) -> Any:
"""Get metadata from the request context"""
return self.metadata.get(key, default)


# Base class for specific endpoint handlers
class OpenAIServingBase(ABC):
"""Abstract base class for OpenAI endpoint handlers"""
@@ -101,54 +31,44 @@ async def handle_request(
"""Handle the specific request type with common pattern"""
try:
# Validate request
error = self._validate_request(request)
if error:
return error

# Create request context
ctx = RequestContext(
raw_request=raw_request,
openai_request=request,
request_id=self._generate_request_id(request),
)
error_msg = self._validate_request(request)
if error_msg:
return self.create_error_response(error_msg)

# Convert to internal format
adapted_request, processed_request = self._convert_to_internal_request(
[request], [ctx.request_id]
[request], [self._generate_request_id_base(request)]
)

# Check if this handler supports streaming
# Note(Xinyuan): raw_request below is only used for detecting the connection of the client
if hasattr(request, "stream") and request.stream:
return await self._handle_streaming_request(
adapted_request, processed_request, ctx
adapted_request, processed_request, raw_request
)
else:
return await self._handle_non_streaming_request(
adapted_request, processed_request, ctx
adapted_request, processed_request, raw_request
)

except Exception as e:
logger.error(f"Error in request: {e}")
return create_error_response(
return self.create_error_response(
message=f"Internal server error: {str(e)}",
err_type="InternalServerError",
status_code=500,
)

def _generate_request_id(self, request: OpenAIServingRequest) -> str:
@abstractmethod
def _request_id_prefix(self) -> str:
"""Generate request ID based on request type"""
pass

def _generate_request_id_base(self, request: OpenAIServingRequest) -> str:
"""Generate request ID based on request type"""
# Default implementation - can be overridden
if rid := getattr(request, "rid", None):
return rid

# Determine prefix based on request type
prefix_mapping = {
ChatCompletionRequest: "chatcmpl",
CompletionRequest: "cmpl",
EmbeddingRequest: "embd",
}
prefix = prefix_mapping.get(type(request), "req")
return f"{prefix}-{uuid.uuid4()}"
return f"{self._request_id_prefix()}{uuid.uuid4().hex}"

@abstractmethod
def _convert_to_internal_request(
@@ -161,37 +81,41 @@ def _convert_to_internal_request(
"""Convert OpenAI request to internal format"""
pass

@abstractmethod
async def _handle_streaming_request(
self,
adapted_request: GenerateReqInput,
request: OpenAIServingRequest,
ctx: RequestContext,
raw_request: Request,
) -> StreamingResponse:
"""Handle streaming request"""
pass
"""Handle streaming request

Override this method in child classes that support streaming requests.
"""
return self.create_error_response(
message=f"{self.__class__.__name__} does not support streaming requests",
err_type="NotImplementedError",
status_code=501,
)

@abstractmethod
async def _handle_non_streaming_request(
self,
adapted_request: GenerateReqInput,
request: OpenAIServingRequest,
ctx: RequestContext,
raw_request: Request,
) -> Union[Any, ErrorResponse]:
"""Handle non-streaming request"""
pass
"""Handle non-streaming request

Override this method in child classes that support non-streaming requests.
"""
return self.create_error_response(
message=f"{self.__class__.__name__} does not support non-streaming requests",
err_type="NotImplementedError",
status_code=501,
)

def _validate_request(
self, request: OpenAIServingRequest
) -> Optional[ErrorResponse]:
def _validate_request(self, request: OpenAIServingRequest) -> Optional[str]:
"""Validate request"""
validation_rules = get_validation_rules(request)
for rule in validation_rules:
param_value = rule.param_getter(request)
error_msg = rule.validator_func(param_value)
if error_msg:
return create_error_response(error_msg, param=rule.param_name)
return None
pass

def _calculate_streaming_usage_base(
self,
@@ -219,3 +143,29 @@ def _calculate_streaming_usage_base(
total_tokens=total_prompt_tokens + total_completion_tokens,
prompt_tokens_details=prompt_tokens_details,
)

def create_error_response(
self,
message: str,
err_type: str = "BadRequestError",
status_code: int = 400,
param: Optional[str] = None,
) -> ErrorResponse:
"""Create an error response"""
return ErrorResponse(
object="error",
message=message,
type=err_type,
param=param,
code=status_code,
)

def create_streaming_error_response(
self,
message: str,
err_type: str = "BadRequestError",
status_code: int = 400,
) -> str:
"""Create a streaming error response"""
error = self.create_error_response(message, err_type, status_code)
return json.dumps({"error": error.model_dump()})
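
Net effect of the serving_base.py changes: RequestContext and the rule-based validation layer are gone, the streaming and non-streaming handlers are no longer abstract but default to a 501 "not supported" error, and request IDs come from a per-endpoint _request_id_prefix. A standalone sketch of that pattern, with SGLang's request and error types stubbed out (EchoHandler and the simplified dict-based signatures are hypothetical):

import uuid
from abc import ABC, abstractmethod
from typing import Any

class ServingBase(ABC):
    @abstractmethod
    def _request_id_prefix(self) -> str:
        ...

    def _generate_request_id_base(self, request: Any) -> str:
        # Honor a caller-supplied rid, otherwise prefix a fresh UUID hex string.
        if rid := getattr(request, "rid", None):
            return rid
        return f"{self._request_id_prefix()}{uuid.uuid4().hex}"

    async def _handle_streaming_request(self, request: Any) -> dict:
        # Default for endpoints that never stream; override where streaming is supported.
        return {"error": f"{type(self).__name__} does not support streaming requests", "code": 501}

    async def _handle_non_streaming_request(self, request: Any) -> dict:
        return {"error": f"{type(self).__name__} does not support non-streaming requests", "code": 501}

class EchoHandler(ServingBase):
    # Implements only the non-streaming path; streaming falls back to the 501 default.
    def _request_id_prefix(self) -> str:
        return "echo-"

    async def _handle_non_streaming_request(self, request: Any) -> dict:
        return {"id": self._generate_request_id_base(request), "echo": request}

Because _validate_request now returns Optional[str] rather than an ErrorResponse, handlers report problems as plain messages and the base class alone decides how to wrap them: create_error_response for regular responses, create_streaming_error_response for the JSON payload of an SSE stream.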