diff --git a/api/core/v1alpha1/model_types.go b/api/core/v1alpha1/model_types.go
index 8fe6ae93..cf28e637 100644
--- a/api/core/v1alpha1/model_types.go
+++ b/api/core/v1alpha1/model_types.go
@@ -101,16 +101,16 @@ type FlavorName string
 type Flavor struct {
 	// Name represents the flavor name, which will be used in model claim.
 	Name FlavorName `json:"name"`
-	// Requests defines the required accelerators to serve the model for each replica,
-	// like . For multi-hosts cases, the requests here indicates
+	// Limits defines the required accelerators to serve the model for each replica,
+	// like . For multi-host cases, the limits here indicate
 	// the resource requirements for each replica, usually equals to the TP size.
 	// Not recommended to set the cpu and memory usage here:
 	// - if using playground, you can define the cpu/mem usage at backendConfig.
 	// - if using inference service, you can define the cpu/mem at the container resources.
-	// However, if you define the same accelerator requests at playground/service as well,
-	// the requests will be overwritten by the flavor requests.
+	// However, if you define the same accelerator resources at playground/service as well,
+	// the resources will be overwritten by the flavor limits here.
 	// +optional
-	Requests v1.ResourceList `json:"requests,omitempty"`
+	Limits v1.ResourceList `json:"limits,omitempty"`
 	// NodeSelector represents the node candidates for Pod placements, if a node doesn't
 	// meet the nodeSelector, it will be filtered out in the resourceFungibility scheduler plugin.
 	// If nodeSelector is empty, it means every node is a candidate.
@@ -129,11 +129,15 @@ type Flavor struct {
 type InferenceConfig struct {
 	// Flavors represents the accelerator requirements to serve the model.
 	// Flavors are fungible following the priority represented by the slice order.
+	// This is used both in Playground and Inference Service.
 	// +kubebuilder:validation:MaxItems=8
 	// +optional
 	Flavors []Flavor `json:"flavors,omitempty"`
 	// SharedMemorySize represents the size of /dev/shm required in the runtime of
 	// inference workload.
+	// This is only used in Playground. Inference Service can configure the shared memory
+	// directly in PodSpec.
+	// +optional
 	SharedMemorySize *resource.Quantity `json:"sharedMemorySize,omitempty"`
 }
diff --git a/api/core/v1alpha1/zz_generated.deepcopy.go b/api/core/v1alpha1/zz_generated.deepcopy.go
index a4f87838..bc91676e 100644
--- a/api/core/v1alpha1/zz_generated.deepcopy.go
+++ b/api/core/v1alpha1/zz_generated.deepcopy.go
@@ -29,8 +29,8 @@ import (
 // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
 func (in *Flavor) DeepCopyInto(out *Flavor) {
 	*out = *in
-	if in.Requests != nil {
-		in, out := &in.Requests, &out.Requests
+	if in.Limits != nil {
+		in, out := &in.Limits, &out.Limits
 		*out = make(v1.ResourceList, len(*in))
 		for key, val := range *in {
 			(*out)[key] = val.DeepCopy()
@@ -72,6 +72,11 @@ func (in *InferenceConfig) DeepCopyInto(out *InferenceConfig) {
 			(*in)[i].DeepCopyInto(&(*out)[i])
 		}
 	}
+	if in.SharedMemorySize != nil {
+		in, out := &in.SharedMemorySize, &out.SharedMemorySize
+		x := (*in).DeepCopy()
+		*out = &x
+	}
 }
 
 // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new InferenceConfig.
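For orientation only (not part of the diff): a hedged sketch of how the renamed field reads from Go. The example function is hypothetical and assumes it sits alongside the types in api/core/v1alpha1, so only the upstream Kubernetes imports are needed.

package v1alpha1

import (
	v1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/resource"
)

// exampleInferenceConfig sketches the renamed API surface: accelerator counts
// go into Limits, and SharedMemorySize is only honored by Playground.
func exampleInferenceConfig() InferenceConfig {
	shm := resource.MustParse("2Gi")
	return InferenceConfig{
		Flavors: []Flavor{{
			Name: FlavorName("a100-80gb"),
			// Only accelerators belong here; cpu/mem are better placed on the
			// Playground backendConfig or the Service container resources.
			Limits: v1.ResourceList{
				v1.ResourceName("nvidia.com/gpu"): resource.MustParse("8"),
			},
		}},
		// Ignored by Inference Service, which sets /dev/shm on its own PodSpec.
		SharedMemorySize: &shm,
	}
}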
diff --git a/client-go/applyconfiguration/core/v1alpha1/flavor.go b/client-go/applyconfiguration/core/v1alpha1/flavor.go
index b1f609ff..bb72bcad 100644
--- a/client-go/applyconfiguration/core/v1alpha1/flavor.go
+++ b/client-go/applyconfiguration/core/v1alpha1/flavor.go
@@ -26,7 +26,7 @@ import (
 // with apply.
 type FlavorApplyConfiguration struct {
 	Name         *corev1alpha1.FlavorName `json:"name,omitempty"`
-	Requests     *v1.ResourceList         `json:"requests,omitempty"`
+	Limits       *v1.ResourceList         `json:"limits,omitempty"`
 	NodeSelector map[string]string        `json:"nodeSelector,omitempty"`
 	Params       map[string]string        `json:"params,omitempty"`
 }
@@ -45,11 +45,11 @@ func (b *FlavorApplyConfiguration) WithName(value corev1alpha1.FlavorName) *Flav
 	return b
 }
 
-// WithRequests sets the Requests field in the declarative configuration to the given value
+// WithLimits sets the Limits field in the declarative configuration to the given value
 // and returns the receiver, so that objects can be built by chaining "With" function invocations.
-// If called multiple times, the Requests field is set to the value of the last call.
-func (b *FlavorApplyConfiguration) WithRequests(value v1.ResourceList) *FlavorApplyConfiguration {
-	b.Requests = &value
+// If called multiple times, the Limits field is set to the value of the last call.
+func (b *FlavorApplyConfiguration) WithLimits(value v1.ResourceList) *FlavorApplyConfiguration {
+	b.Limits = &value
 	return b
 }
diff --git a/client-go/applyconfiguration/core/v1alpha1/inferenceconfig.go b/client-go/applyconfiguration/core/v1alpha1/inferenceconfig.go
index bece2699..6df24c2b 100644
--- a/client-go/applyconfiguration/core/v1alpha1/inferenceconfig.go
+++ b/client-go/applyconfiguration/core/v1alpha1/inferenceconfig.go
@@ -17,10 +17,15 @@ limitations under the License.
 
 package v1alpha1
 
+import (
+	resource "k8s.io/apimachinery/pkg/api/resource"
+)
+
 // InferenceConfigApplyConfiguration represents a declarative configuration of the InferenceConfig type for use
 // with apply.
 type InferenceConfigApplyConfiguration struct {
-	Flavors []FlavorApplyConfiguration `json:"flavors,omitempty"`
+	Flavors          []FlavorApplyConfiguration `json:"flavors,omitempty"`
+	SharedMemorySize *resource.Quantity         `json:"sharedMemorySize,omitempty"`
 }
 
 // InferenceConfigApplyConfiguration constructs a declarative configuration of the InferenceConfig type for use with
@@ -41,3 +46,11 @@ func (b *InferenceConfigApplyConfiguration) WithFlavors(values ...*FlavorApplyCo
 	}
 	return b
 }
+
+// WithSharedMemorySize sets the SharedMemorySize field in the declarative configuration to the given value
+// and returns the receiver, so that objects can be built by chaining "With" function invocations.
+// If called multiple times, the SharedMemorySize field is set to the value of the last call.
+func (b *InferenceConfigApplyConfiguration) WithSharedMemorySize(value resource.Quantity) *InferenceConfigApplyConfiguration {
+	b.SharedMemorySize = &value
+	return b
+}
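For illustration only: a hedged sketch of chaining the regenerated builders. The Flavor() and InferenceConfig() constructors are assumed from the usual applyconfiguration-gen naming and are not shown in this diff; the snippet assumes placement in the client-go applyconfiguration v1alpha1 package.

package v1alpha1

import (
	corev1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/resource"
)

// exampleApplyConfig shows the WithLimits and WithSharedMemorySize builders
// introduced by this change, chained in the standard declarative style.
func exampleApplyConfig() *InferenceConfigApplyConfiguration {
	return InferenceConfig().
		WithFlavors(Flavor().
			WithName("t4").
			WithLimits(corev1.ResourceList{
				corev1.ResourceName("nvidia.com/gpu"): resource.MustParse("1"),
			})).
		WithSharedMemorySize(resource.MustParse("2Gi"))
}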
diff --git a/config/crd/bases/llmaz.io_openmodels.yaml b/config/crd/bases/llmaz.io_openmodels.yaml
index d72a2f8c..709d9f6d 100644
--- a/config/crd/bases/llmaz.io_openmodels.yaml
+++ b/config/crd/bases/llmaz.io_openmodels.yaml
@@ -54,6 +54,7 @@ spec:
                 description: |-
                   Flavors represents the accelerator requirements to serve the model.
                   Flavors are fungible following the priority represented by the slice order.
+                  This is used both in Playground and Inference Service.
                 items:
                   description: |-
                     Flavor defines the accelerator requirements for a model and the necessary parameters
                     - Pod scheduling with node selectors specified.
                     - Cluster autoscaling with essential parameters provided.
                   properties:
+                    limits:
+                      additionalProperties:
+                        anyOf:
+                        - type: integer
+                        - type: string
+                        pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
+                        x-kubernetes-int-or-string: true
+                      description: |-
+                        Limits defines the required accelerators to serve the model for each replica,
+                        like . For multi-host cases, the limits here indicate
+                        the resource requirements for each replica, usually equals to the TP size.
+                        Not recommended to set the cpu and memory usage here:
+                        - if using playground, you can define the cpu/mem usage at backendConfig.
+                        - if using inference service, you can define the cpu/mem at the container resources.
+                        However, if you define the same accelerator resources at playground/service as well,
+                        the resources will be overwritten by the flavor limits here.
+                      type: object
                     name:
                       description: Name represents the flavor name, which will be used in model claim.
@@ -83,23 +101,6 @@ spec:
                         with for AWS. Preset parameters: TP, PP, INSTANCE-TYPE.
                       type: object
-                    requests:
-                      additionalProperties:
-                        anyOf:
-                        - type: integer
-                        - type: string
-                        pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
-                        x-kubernetes-int-or-string: true
-                      description: |-
-                        Requests defines the required accelerators to serve the model for each replica,
-                        like . For multi-hosts cases, the requests here indicates
-                        the resource requirements for each replica, usually equals to the TP size.
-                        Not recommended to set the cpu and memory usage here:
-                        - if using playground, you can define the cpu/mem usage at backendConfig.
-                        - if using inference service, you can define the cpu/mem at the container resources.
-                        However, if you define the same accelerator requests at playground/service as well,
-                        the requests will be overwritten by the flavor requests.
-                      type: object
                   required:
                   - name
                   type: object
@@ -112,6 +113,8 @@ spec:
                 description: |-
                   SharedMemorySize represents the size of /dev/shm required in the runtime of
                   inference workload.
+                  This is only used in Playground. Inference Service can configure the shared memory
+                  directly in PodSpec.
                 pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
                 x-kubernetes-int-or-string: true
               type: object
diff --git a/docs/examples/hostpath/model.yaml b/docs/examples/hostpath/model.yaml
index 294865a7..66830ffd 100644
--- a/docs/examples/hostpath/model.yaml
+++ b/docs/examples/hostpath/model.yaml
@@ -10,5 +10,5 @@ spec:
   inferenceConfig:
     flavors:
       - name: t4 # GPU type
-        requests:
+        limits:
           nvidia.com/gpu: 1
diff --git a/docs/examples/huggingface/model.yaml b/docs/examples/huggingface/model.yaml
index 2d7749f7..7cc24cdc 100644
--- a/docs/examples/huggingface/model.yaml
+++ b/docs/examples/huggingface/model.yaml
@@ -10,5 +10,5 @@ spec:
   inferenceConfig:
     flavors:
      - name: t4 # GPU type
-        requests:
+        limits:
           nvidia.com/gpu: 1
diff --git a/docs/examples/modelscope/model.yaml b/docs/examples/modelscope/model.yaml
index 288b1dd8..c7ed9017 100644
--- a/docs/examples/modelscope/model.yaml
+++ b/docs/examples/modelscope/model.yaml
@@ -11,5 +11,5 @@ spec:
   inferenceConfig:
     flavors:
       - name: t4 # GPU type
-        requests:
+        limits:
           nvidia.com/gpu: 1
diff --git a/docs/examples/multi-nodes/model.yaml b/docs/examples/multi-nodes/model.yaml
index 98736386..cb8fc195 100644
--- a/docs/examples/multi-nodes/model.yaml
+++ b/docs/examples/multi-nodes/model.yaml
@@ -10,13 +10,13 @@ spec:
   inferenceConfig:
     flavors:
       - name: a100-80gb
-        requests:
+        limits:
           nvidia.com/gpu: 8 # single node request
         params:
           TP: "8" # 8 GPUs per node, equal to nvidia.com/gpu
           PP: "2" # 2 nodes
       # - name: h100
-      #   requests:
+      #   limits:
       #     nvidia.com/gpu: 8 # single node request
      #   params:
       #     TP: "8"
diff --git a/docs/examples/objstore-oss/model.yaml b/docs/examples/objstore-oss/model.yaml
index 1422cee9..a9b03774 100644
--- a/docs/examples/objstore-oss/model.yaml
+++ b/docs/examples/objstore-oss/model.yaml
@@ -11,5 +11,5 @@ spec:
   inferenceConfig:
     flavors:
       - name: t4 # GPU type
-        requests:
+        limits:
           nvidia.com/gpu: 1
diff --git a/docs/examples/sglang/model.yaml b/docs/examples/sglang/model.yaml
index 8da61042..860fe04b 100644
--- a/docs/examples/sglang/model.yaml
+++ b/docs/examples/sglang/model.yaml
@@ -10,5 +10,5 @@ spec:
   inferenceConfig:
     flavors:
       - name: t4 # GPU type
-        requests:
+        limits:
           nvidia.com/gpu: 1
diff --git a/docs/examples/speculative-decoding/vllm/model.yaml b/docs/examples/speculative-decoding/vllm/model.yaml
index 7468fea7..d4ac66c1 100644
--- a/docs/examples/speculative-decoding/vllm/model.yaml
+++ b/docs/examples/speculative-decoding/vllm/model.yaml
@@ -10,7 +10,7 @@ spec:
   inferenceConfig:
     flavors:
       - name: a10 # gpu type
-        requests:
+        limits:
           nvidia.com/gpu: 1
 ---
 apiVersion: llmaz.io/v1alpha1
diff --git a/docs/examples/tgi/model.yaml b/docs/examples/tgi/model.yaml
index 8da61042..860fe04b 100644
--- a/docs/examples/tgi/model.yaml
+++ b/docs/examples/tgi/model.yaml
@@ -10,5 +10,5 @@ spec:
   inferenceConfig:
     flavors:
       - name: t4 # GPU type
-        requests:
+        limits:
           nvidia.com/gpu: 1
diff --git a/pkg/controller/inference/service_controller.go b/pkg/controller/inference/service_controller.go
index 24aaa93b..3aae6e8e 100644
--- a/pkg/controller/inference/service_controller.go
+++ b/pkg/controller/inference/service_controller.go
@@ -201,8 +201,8 @@ func injectModelFlavor(template *corev1.PodTemplateSpec, model *coreapi.OpenMode
 
 	for i, flavor := range model.Spec.InferenceConfig.Flavors {
 		if flavor.Name == flavorName {
-			requests := model.Spec.InferenceConfig.Flavors[i].Requests
-			for k, v := range requests {
+			limits := model.Spec.InferenceConfig.Flavors[i].Limits
+			for k, v := range limits {
 				if container.Resources.Requests == nil {
 					container.Resources.Requests = map[corev1.ResourceName]resource.Quantity{}
 				}
diff --git a/pkg/controller_helper/helper.go b/pkg/controller_helper/helper.go
index 0b33123b..99cad552 100644
--- a/pkg/controller_helper/helper.go
+++ b/pkg/controller_helper/helper.go
@@ -121,6 +121,9 @@ func FirstAssignedFlavor(model *coreapi.OpenModel, playground *inferenceapi.Play
 // the second one is whether this is a multi-host inference.
 func MultiHostInference(model *coreapi.OpenModel, playground *inferenceapi.Playground) (int32, bool) {
 	flavors := FirstAssignedFlavor(model, playground)
+	// This is not valid for all cases; for example, SGLang uses TP for model parallelism.
+	// However, that is not a recommended way since TP requires more communication than PP.
+	// It's OK to support only PP for now.
 	if len(flavors) > 0 && flavors[0].Params["PP"] != "" {
 		size, err := strconv.Atoi(flavors[0].Params["PP"])
 		if err != nil {
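To make the PP arithmetic above concrete, here is a hedged, hypothetical sketch (not in this PR) that mirrors what MultiHostInference derives from the flavor params; with the docs/examples/multi-nodes flavor (limits nvidia.com/gpu: 8, TP "8", PP "2"), each replica spans 2 hosts with 8 GPUs per host. The fallback behavior on a malformed PP value is an assumption, not the real helper's.

package main

import (
	"fmt"
	"strconv"
)

// hostsPerReplica returns how many hosts one replica spans, based on the PP
// flavor param; the flavor limits apply per host.
func hostsPerReplica(params map[string]string) (int32, bool) {
	pp := params["PP"]
	if pp == "" {
		return 1, false // no PP param: single-host inference
	}
	size, err := strconv.Atoi(pp)
	if err != nil {
		return 1, false // hypothetical fallback; the real helper handles this error itself
	}
	return int32(size), size > 1
}

func main() {
	hosts, multi := hostsPerReplica(map[string]string{"TP": "8", "PP": "2"})
	fmt.Println(hosts, multi) // 2 true
}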
diff --git a/test/util/validation/validate_service.go b/test/util/validation/validate_service.go
index 81eb6193..aaf65236 100644
--- a/test/util/validation/validate_service.go
+++ b/test/util/validation/validate_service.go
@@ -174,9 +174,9 @@ func ValidateModelFlavor(service *inferenceapi.Service, model *coreapi.OpenModel
 
 	for _, flavor := range model.Spec.InferenceConfig.Flavors {
 		if flavor.Name == flavorName {
-			requests := flavor.Requests
+			limits := flavor.Limits
 			container := workload.Spec.LeaderWorkerTemplate.WorkerTemplate.Spec.Containers[0]
-			for k, v := range requests {
+			for k, v := range limits {
 				if !container.Resources.Requests[k].Equal(v) {
 					return fmt.Errorf("unexpected request value %v, got %v", v, workload.Spec.LeaderWorkerTemplate.WorkerTemplate.Spec.Containers[0].Resources.Requests[k])
 				}
diff --git a/test/util/wrapper/model.go b/test/util/wrapper/model.go
index 11e930b8..a0a4ebab 100644
--- a/test/util/wrapper/model.go
+++ b/test/util/wrapper/model.go
@@ -133,10 +133,10 @@ func (w *FlavorWrapper) Obj() *coreapi.Flavor {
 }
 
 func (w *FlavorWrapper) SetRequest(r, v string) *FlavorWrapper {
-	if w.Requests == nil {
-		w.Requests = map[v1.ResourceName]resource.Quantity{}
+	if w.Limits == nil {
+		w.Limits = map[v1.ResourceName]resource.Quantity{}
 	}
-	w.Requests[v1.ResourceName(r)] = resource.MustParse(v)
+	w.Limits[v1.ResourceName(r)] = resource.MustParse(v)
 	return w
 }