14 changes: 9 additions & 5 deletions api/core/v1alpha1/model_types.go
@@ -101,16 +101,16 @@ type FlavorName string
 type Flavor struct {
     // Name represents the flavor name, which will be used in model claim.
     Name FlavorName `json:"name"`
-    // Requests defines the required accelerators to serve the model for each replica,
-    // like <nvidia.com/gpu: 8>. For multi-hosts cases, the requests here indicates
+    // Limits defines the required accelerators to serve the model for each replica,
+    // like <nvidia.com/gpu: 8>. For multi-host cases, the limits here indicate
     // the resource requirements for each replica, usually equals to the TP size.
     // Not recommended to set the cpu and memory usage here:
     // - if using playground, you can define the cpu/mem usage at backendConfig.
     // - if using inference service, you can define the cpu/mem at the container resources.
-    // However, if you define the same accelerator requests at playground/service as well,
-    // the requests will be overwritten by the flavor requests.
+    // However, if you define the same accelerator resources at playground/service as well,
+    // the resources will be overwritten by the flavor limits here.
     // +optional
-    Requests v1.ResourceList `json:"requests,omitempty"`
+    Limits v1.ResourceList `json:"limits,omitempty"`
     // NodeSelector represents the node candidates for Pod placements, if a node doesn't
     // meet the nodeSelector, it will be filtered out in the resourceFungibility scheduler plugin.
     // If nodeSelector is empty, it means every node is a candidate.
@@ -129,11 +129,15 @@ type Flavor struct {
 type InferenceConfig struct {
     // Flavors represents the accelerator requirements to serve the model.
     // Flavors are fungible following the priority represented by the slice order.
+    // This is used both in Playground and Inference Service.
+    // +kubebuilder:validation:MaxItems=8
     // +optional
     Flavors []Flavor `json:"flavors,omitempty"`
     // SharedMemorySize represents the size of /dev/shm required in the runtime of
     // inference workload.
+    // This is only used in Playground. Inference Service can configure the shared memory
+    // directly in PodSpec.
     // +optional
     SharedMemorySize *resource.Quantity `json:"sharedMemorySize,omitempty"`
 }

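To make the renamed field concrete, here is a minimal, self-contained Go sketch of declaring a flavor with `Limits`. The trimmed `Flavor` type below is a local copy for illustration only; the real definition lives in api/core/v1alpha1/model_types.go.

```go
package main

import (
	"fmt"

	corev1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/resource"
)

// Trimmed copy of the Flavor type from this diff, kept local so the
// snippet compiles on its own.
type FlavorName string

type Flavor struct {
	Name   FlavorName          `json:"name"`
	Limits corev1.ResourceList `json:"limits,omitempty"`
}

func main() {
	// Each replica asks for 8 GPUs, i.e. the TP size in the single-node case.
	flavor := Flavor{
		Name: "a100-80gb",
		Limits: corev1.ResourceList{
			"nvidia.com/gpu": resource.MustParse("8"),
		},
	}
	fmt.Printf("flavor %s limits: %v\n", flavor.Name, flavor.Limits)
}
```
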
9 changes: 7 additions & 2 deletions api/core/v1alpha1/zz_generated.deepcopy.go

Some generated files are not rendered by default.

10 changes: 5 additions & 5 deletions client-go/applyconfiguration/core/v1alpha1/flavor.go

Some generated files are not rendered by default.

15 changes: 14 additions & 1 deletion client-go/applyconfiguration/core/v1alpha1/inferenceconfig.go

Some generated files are not rendered by default.

37 changes: 20 additions & 17 deletions config/crd/bases/llmaz.io_openmodels.yaml
@@ -54,13 +54,31 @@ spec:
                 description: |-
                   Flavors represents the accelerator requirements to serve the model.
                   Flavors are fungible following the priority represented by the slice order.
+                  This is used both in Playground and Inference Service.
                 items:
                   description: |-
                     Flavor defines the accelerator requirements for a model and the necessary parameters
                     in autoscaling. Right now, it will be used in two places:
                     - Pod scheduling with node selectors specified.
                     - Cluster autoscaling with essential parameters provided.
                   properties:
+                    limits:
+                      additionalProperties:
+                        anyOf:
+                        - type: integer
+                        - type: string
+                        pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
+                        x-kubernetes-int-or-string: true
+                      description: |-
+                        Limits defines the required accelerators to serve the model for each replica,
+                        like <nvidia.com/gpu: 8>. For multi-host cases, the limits here indicate
+                        the resource requirements for each replica, usually equals to the TP size.
+                        Not recommended to set the cpu and memory usage here:
+                        - if using playground, you can define the cpu/mem usage at backendConfig.
+                        - if using inference service, you can define the cpu/mem at the container resources.
+                        However, if you define the same accelerator resources at playground/service as well,
+                        the resources will be overwritten by the flavor limits here.
+                      type: object
                     name:
                       description: Name represents the flavor name, which will
                         be used in model claim.
@@ -83,23 +101,6 @@ spec:
                         with <INSTANCE-TYPE: p4d.24xlarge> for AWS.
                         Preset parameters: TP, PP, INSTANCE-TYPE.
                       type: object
-                    requests:
-                      additionalProperties:
-                        anyOf:
-                        - type: integer
-                        - type: string
-                        pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
-                        x-kubernetes-int-or-string: true
-                      description: |-
-                        Requests defines the required accelerators to serve the model for each replica,
-                        like <nvidia.com/gpu: 8>. For multi-hosts cases, the requests here indicates
-                        the resource requirements for each replica, usually equals to the TP size.
-                        Not recommended to set the cpu and memory usage here:
-                        - if using playground, you can define the cpu/mem usage at backendConfig.
-                        - if using inference service, you can define the cpu/mem at the container resources.
-                        However, if you define the same accelerator requests at playground/service as well,
-                        the requests will be overwritten by the flavor requests.
-                      type: object
                     required:
                     - name
                     type: object
@@ -112,6 +113,8 @@ spec:
               description: |-
                 SharedMemorySize represents the size of /dev/shm required in the runtime of
                 inference workload.
+                This is only used in Playground. Inference Service can configure the shared memory
+                directly in PodSpec.
               pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
               x-kubernetes-int-or-string: true
             type: object
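
The new SharedMemorySize note points Inference Service users at the PodSpec instead. A hedged sketch of the usual Kubernetes pattern for that — a memory-backed emptyDir mounted at /dev/shm; the volume name, container name, and 16Gi value below are illustrative, not taken from this PR:

```go
package main

import (
	"fmt"

	corev1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/resource"
)

func main() {
	// Size /dev/shm with a memory-backed emptyDir volume.
	shm := resource.MustParse("16Gi")
	pod := corev1.PodSpec{
		Volumes: []corev1.Volume{{
			Name: "dshm",
			VolumeSource: corev1.VolumeSource{
				EmptyDir: &corev1.EmptyDirVolumeSource{
					Medium:    corev1.StorageMediumMemory,
					SizeLimit: &shm,
				},
			},
		}},
		Containers: []corev1.Container{{
			Name: "inference",
			VolumeMounts: []corev1.VolumeMount{{
				Name:      "dshm",
				MountPath: "/dev/shm",
			}},
		}},
	}
	fmt.Printf("volume %q mounted at %s\n",
		pod.Volumes[0].Name, pod.Containers[0].VolumeMounts[0].MountPath)
}
```
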
2 changes: 1 addition & 1 deletion docs/examples/hostpath/model.yaml
@@ -10,5 +10,5 @@ spec:
   inferenceConfig:
     flavors:
     - name: t4 # GPU type
-      requests:
+      limits:
         nvidia.com/gpu: 1
2 changes: 1 addition & 1 deletion docs/examples/huggingface/model.yaml
@@ -10,5 +10,5 @@ spec:
   inferenceConfig:
     flavors:
     - name: t4 # GPU type
-      requests:
+      limits:
         nvidia.com/gpu: 1
2 changes: 1 addition & 1 deletion docs/examples/modelscope/model.yaml
@@ -11,5 +11,5 @@ spec:
   inferenceConfig:
     flavors:
     - name: t4 # GPU type
-      requests:
+      limits:
         nvidia.com/gpu: 1
4 changes: 2 additions & 2 deletions docs/examples/multi-nodes/model.yaml
@@ -10,13 +10,13 @@ spec:
   inferenceConfig:
     flavors:
     - name: a100-80gb
-      requests:
+      limits:
         nvidia.com/gpu: 8 # single node request
       params:
         TP: "8" # 8 GPUs per node, equal to nvidia.com/gpu
         PP: "2" # 2 nodes
     # - name: h100
-    #   requests:
+    #   limits:
     #     nvidia.com/gpu: 8 # single node request
     #   params:
     #     TP: "8"
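
As a sanity check on the a100-80gb flavor above: the limits count GPUs per host, so with TP=8 and PP=2 each replica spans 2 hosts and 16 GPUs in total. A trivial sketch of that arithmetic:

```go
package main

import "fmt"

func main() {
	// From the multi-nodes example: TP = GPUs per host, PP = hosts per replica.
	tp, pp := 8, 2
	fmt.Printf("hosts per replica: %d, GPUs per replica: %d\n", pp, tp*pp)
}
```
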
2 changes: 1 addition & 1 deletion docs/examples/objstore-oss/model.yaml
@@ -11,5 +11,5 @@ spec:
   inferenceConfig:
     flavors:
     - name: t4 # GPU type
-      requests:
+      limits:
         nvidia.com/gpu: 1
2 changes: 1 addition & 1 deletion docs/examples/sglang/model.yaml
@@ -10,5 +10,5 @@ spec:
   inferenceConfig:
     flavors:
     - name: t4 # GPU type
-      requests:
+      limits:
         nvidia.com/gpu: 1
2 changes: 1 addition & 1 deletion docs/examples/speculative-decoding/vllm/model.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ spec:
inferenceConfig:
flavors:
- name: a10 # gpu type
requests:
limits:
nvidia.com/gpu: 1
---
apiVersion: llmaz.io/v1alpha1
2 changes: 1 addition & 1 deletion docs/examples/tgi/model.yaml
@@ -10,5 +10,5 @@ spec:
   inferenceConfig:
     flavors:
     - name: t4 # GPU type
-      requests:
+      limits:
         nvidia.com/gpu: 1
4 changes: 2 additions & 2 deletions pkg/controller/inference/service_controller.go
@@ -201,8 +201,8 @@ func injectModelFlavor(template *corev1.PodTemplateSpec, model *coreapi.OpenMode

     for i, flavor := range model.Spec.InferenceConfig.Flavors {
         if flavor.Name == flavorName {
-            requests := model.Spec.InferenceConfig.Flavors[i].Requests
-            for k, v := range requests {
+            limits := model.Spec.InferenceConfig.Flavors[i].Limits
+            for k, v := range limits {
                 if container.Resources.Requests == nil {
                     container.Resources.Requests = map[corev1.ResourceName]resource.Quantity{}
                 }
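
A simplified, standalone sketch of what this injection does: each accelerator quantity in the matched flavor's limits becomes a resource *request* on the serving container, creating the requests map on first use. The real injectModelFlavor also resolves the flavor by name and works on the pod template; only the copying loop is shown in the hunk above, so anything beyond it here is an assumption.

```go
package main

import (
	"fmt"

	corev1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/resource"
)

// injectFlavorLimits mirrors the loop above: flavor limits are written into
// the container's resource requests, allocating the map if it is nil.
func injectFlavorLimits(container *corev1.Container, limits corev1.ResourceList) {
	for k, v := range limits {
		if container.Resources.Requests == nil {
			container.Resources.Requests = corev1.ResourceList{}
		}
		container.Resources.Requests[k] = v
	}
}

func main() {
	c := corev1.Container{Name: "inference"}
	injectFlavorLimits(&c, corev1.ResourceList{"nvidia.com/gpu": resource.MustParse("8")})
	fmt.Println(c.Resources.Requests) // the GPU quantity now appears under Requests
}
```
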
3 changes: 3 additions & 0 deletions pkg/controller_helper/helper.go
@@ -121,6 +121,9 @@ func FirstAssignedFlavor(model *coreapi.OpenModel, playground *inferenceapi.Play
 // the second one is whether this is a multi-host inference.
 func MultiHostInference(model *coreapi.OpenModel, playground *inferenceapi.Playground) (int32, bool) {
     flavors := FirstAssignedFlavor(model, playground)
+    // This is not valid for all cases, e.g. SGLang uses TP for model parallelism.
+    // However, that is not a recommended way since TP requires more communication than PP.
+    // It's ok to support PP only at this moment.
     if len(flavors) > 0 && flavors[0].Params["PP"] != "" {
         size, err := strconv.Atoi(flavors[0].Params["PP"])
         if err != nil {
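
A standalone sketch of the PP-based check, assuming the flavor's params map is already in hand. The real MultiHostInference resolves the flavor via FirstAssignedFlavor first, and the single-host fallback on a parse error is an assumption here, since the error branch is cut off in the hunk above.

```go
package main

import (
	"fmt"
	"strconv"
)

// multiHostSize reads the "PP" param: its value is the number of hosts per
// replica, and anything greater than 1 means multi-host inference.
func multiHostSize(params map[string]string) (int32, bool) {
	if pp := params["PP"]; pp != "" {
		size, err := strconv.Atoi(pp)
		if err != nil {
			return 1, false // assumed fallback: treat a bad value as single host
		}
		return int32(size), size > 1
	}
	return 1, false
}

func main() {
	size, multi := multiHostSize(map[string]string{"TP": "8", "PP": "2"})
	fmt.Println(size, multi) // 2 true
}
```
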
4 changes: 2 additions & 2 deletions test/util/validation/validate_service.go
@@ -174,9 +174,9 @@ func ValidateModelFlavor(service *inferenceapi.Service, model *coreapi.OpenModel

     for _, flavor := range model.Spec.InferenceConfig.Flavors {
         if flavor.Name == flavorName {
-            requests := flavor.Requests
+            limits := flavor.Limits
             container := workload.Spec.LeaderWorkerTemplate.WorkerTemplate.Spec.Containers[0]
-            for k, v := range requests {
+            for k, v := range limits {
                 if !container.Resources.Requests[k].Equal(v) {
                     return fmt.Errorf("unexpected request value %v, got %v", v, workload.Spec.LeaderWorkerTemplate.WorkerTemplate.Spec.Containers[0].Resources.Requests[k])
                 }
6 changes: 3 additions & 3 deletions test/util/wrapper/model.go
@@ -133,10 +133,10 @@ func (w *FlavorWrapper) Obj() *coreapi.Flavor {
 }

 func (w *FlavorWrapper) SetRequest(r, v string) *FlavorWrapper {
-    if w.Requests == nil {
-        w.Requests = map[v1.ResourceName]resource.Quantity{}
+    if w.Limits == nil {
+        w.Limits = map[v1.ResourceName]resource.Quantity{}
     }
-    w.Requests[v1.ResourceName(r)] = resource.MustParse(v)
+    w.Limits[v1.ResourceName(r)] = resource.MustParse(v)
     return w
 }
