diff --git a/api/core/v1alpha1/model_types.go b/api/core/v1alpha1/model_types.go
index 8fe6ae93..cf28e637 100644
--- a/api/core/v1alpha1/model_types.go
+++ b/api/core/v1alpha1/model_types.go
@@ -101,16 +101,16 @@ type FlavorName string
 type Flavor struct {
 	// Name represents the flavor name, which will be used in model claim.
 	Name FlavorName `json:"name"`
-	// Requests defines the required accelerators to serve the model for each replica,
-	// like . For multi-hosts cases, the requests here indicates
+	// Limits defines the required accelerators to serve the model for each replica,
+	// like . For multi-host cases, the limits here indicate
 	// the resource requirements for each replica, usually equals to the TP size.
 	// Not recommended to set the cpu and memory usage here:
 	// - if using playground, you can define the cpu/mem usage at backendConfig.
 	// - if using inference service, you can define the cpu/mem at the container resources.
-	// However, if you define the same accelerator requests at playground/service as well,
-	// the requests will be overwritten by the flavor requests.
+	// However, if you define the same accelerator resources at playground/service as well,
+	// the resources will be overwritten by the flavor limits here.
 	// +optional
-	Requests v1.ResourceList `json:"requests,omitempty"`
+	Limits v1.ResourceList `json:"limits,omitempty"`
 	// NodeSelector represents the node candidates for Pod placements, if a node doesn't
 	// meet the nodeSelector, it will be filtered out in the resourceFungibility scheduler plugin.
 	// If nodeSelector is empty, it means every node is a candidate.
@@ -129,11 +129,15 @@ type Flavor struct {
 type InferenceConfig struct {
 	// Flavors represents the accelerator requirements to serve the model.
 	// Flavors are fungible following the priority represented by the slice order.
+	// This is used both in Playground and Inference Service.
 	// +kubebuilder:validation:MaxItems=8
 	// +optional
 	Flavors []Flavor `json:"flavors,omitempty"`
 	// SharedMemorySize represents the size of /dev/shm required in the runtime of
 	// inference workload.
+	// This is only used in Playground. Inference Service can configure the shared memory
+	// directly in PodSpec.
+	// +optional
 	SharedMemorySize *resource.Quantity `json:"sharedMemorySize,omitempty"`
 }
diff --git a/api/core/v1alpha1/zz_generated.deepcopy.go b/api/core/v1alpha1/zz_generated.deepcopy.go
index a4f87838..bc91676e 100644
--- a/api/core/v1alpha1/zz_generated.deepcopy.go
+++ b/api/core/v1alpha1/zz_generated.deepcopy.go
@@ -29,8 +29,8 @@ import (
 // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
 func (in *Flavor) DeepCopyInto(out *Flavor) {
 	*out = *in
-	if in.Requests != nil {
-		in, out := &in.Requests, &out.Requests
+	if in.Limits != nil {
+		in, out := &in.Limits, &out.Limits
 		*out = make(v1.ResourceList, len(*in))
 		for key, val := range *in {
 			(*out)[key] = val.DeepCopy()
@@ -72,6 +72,11 @@ func (in *InferenceConfig) DeepCopyInto(out *InferenceConfig) {
 			(*in)[i].DeepCopyInto(&(*out)[i])
 		}
 	}
+	if in.SharedMemorySize != nil {
+		in, out := &in.SharedMemorySize, &out.SharedMemorySize
+		x := (*in).DeepCopy()
+		*out = &x
+	}
 }
 
 // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new InferenceConfig.
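For orientation only (not part of the diff): a hedged sketch of how the renamed field reads from Go. The example function is hypothetical and assumes it sits alongside the types in api/core/v1alpha1, so only the upstream Kubernetes imports are needed.

package v1alpha1

import (
	v1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/resource"
)

// exampleInferenceConfig sketches the renamed API surface: accelerator counts
// go into Limits, and SharedMemorySize is only honored by Playground.
func exampleInferenceConfig() InferenceConfig {
	shm := resource.MustParse("2Gi")
	return InferenceConfig{
		Flavors: []Flavor{{
			Name: FlavorName("a100-80gb"),
			// Only accelerators belong here; cpu/mem are better placed on the
			// Playground backendConfig or the Service container resources.
			Limits: v1.ResourceList{
				v1.ResourceName("nvidia.com/gpu"): resource.MustParse("8"),
			},
		}},
		// Ignored by Inference Service, which sets /dev/shm on its own PodSpec.
		SharedMemorySize: &shm,
	}
}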
diff --git a/client-go/applyconfiguration/core/v1alpha1/flavor.go b/client-go/applyconfiguration/core/v1alpha1/flavor.go
index b1f609ff..bb72bcad 100644
--- a/client-go/applyconfiguration/core/v1alpha1/flavor.go
+++ b/client-go/applyconfiguration/core/v1alpha1/flavor.go
@@ -26,7 +26,7 @@ import (
 // with apply.
 type FlavorApplyConfiguration struct {
 	Name         *corev1alpha1.FlavorName `json:"name,omitempty"`
-	Requests     *v1.ResourceList         `json:"requests,omitempty"`
+	Limits       *v1.ResourceList         `json:"limits,omitempty"`
 	NodeSelector map[string]string        `json:"nodeSelector,omitempty"`
 	Params       map[string]string        `json:"params,omitempty"`
 }
@@ -45,11 +45,11 @@ func (b *FlavorApplyConfiguration) WithName(value corev1alpha1.FlavorName) *Flav
 	return b
 }
 
-// WithRequests sets the Requests field in the declarative configuration to the given value
+// WithLimits sets the Limits field in the declarative configuration to the given value
 // and returns the receiver, so that objects can be built by chaining "With" function invocations.
-// If called multiple times, the Requests field is set to the value of the last call.
-func (b *FlavorApplyConfiguration) WithRequests(value v1.ResourceList) *FlavorApplyConfiguration {
-	b.Requests = &value
+// If called multiple times, the Limits field is set to the value of the last call.
+func (b *FlavorApplyConfiguration) WithLimits(value v1.ResourceList) *FlavorApplyConfiguration {
+	b.Limits = &value
 	return b
 }
diff --git a/client-go/applyconfiguration/core/v1alpha1/inferenceconfig.go b/client-go/applyconfiguration/core/v1alpha1/inferenceconfig.go
index bece2699..6df24c2b 100644
--- a/client-go/applyconfiguration/core/v1alpha1/inferenceconfig.go
+++ b/client-go/applyconfiguration/core/v1alpha1/inferenceconfig.go
@@ -17,10 +17,15 @@ limitations under the License.
 
 package v1alpha1
 
+import (
+	resource "k8s.io/apimachinery/pkg/api/resource"
+)
+
 // InferenceConfigApplyConfiguration represents a declarative configuration of the InferenceConfig type for use
 // with apply.
 type InferenceConfigApplyConfiguration struct {
-	Flavors []FlavorApplyConfiguration `json:"flavors,omitempty"`
+	Flavors          []FlavorApplyConfiguration `json:"flavors,omitempty"`
+	SharedMemorySize *resource.Quantity         `json:"sharedMemorySize,omitempty"`
 }
 
 // InferenceConfigApplyConfiguration constructs a declarative configuration of the InferenceConfig type for use with
@@ -41,3 +46,11 @@ func (b *InferenceConfigApplyConfiguration) WithFlavors(values ...*FlavorApplyCo
 	}
 	return b
 }
+
+// WithSharedMemorySize sets the SharedMemorySize field in the declarative configuration to the given value
+// and returns the receiver, so that objects can be built by chaining "With" function invocations.
+// If called multiple times, the SharedMemorySize field is set to the value of the last call.
+func (b *InferenceConfigApplyConfiguration) WithSharedMemorySize(value resource.Quantity) *InferenceConfigApplyConfiguration {
+	b.SharedMemorySize = &value
+	return b
+}
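For illustration only: a hedged sketch of chaining the regenerated builders. The Flavor() and InferenceConfig() constructors are assumed from the usual applyconfiguration-gen naming and are not shown in this diff; the snippet assumes placement in the client-go applyconfiguration v1alpha1 package.

package v1alpha1

import (
	corev1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/resource"
)

// exampleApplyConfig shows the WithLimits and WithSharedMemorySize builders
// introduced by this change, chained in the standard declarative style.
func exampleApplyConfig() *InferenceConfigApplyConfiguration {
	return InferenceConfig().
		WithFlavors(Flavor().
			WithName("t4").
			WithLimits(corev1.ResourceList{
				corev1.ResourceName("nvidia.com/gpu"): resource.MustParse("1"),
			})).
		WithSharedMemorySize(resource.MustParse("2Gi"))
}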
diff --git a/config/crd/bases/llmaz.io_openmodels.yaml b/config/crd/bases/llmaz.io_openmodels.yaml
index d72a2f8c..709d9f6d 100644
--- a/config/crd/bases/llmaz.io_openmodels.yaml
+++ b/config/crd/bases/llmaz.io_openmodels.yaml
@@ -54,6 +54,7 @@ spec:
                 description: |-
                   Flavors represents the accelerator requirements to serve the model.
                   Flavors are fungible following the priority represented by the slice order.
+                  This is used both in Playground and Inference Service.
                 items:
                   description: |-
                     Flavor defines the accelerator requirements for a model and the necessary parameters
                     - Pod scheduling with node selectors specified.
                     - Cluster autoscaling with essential parameters provided.
                   properties:
+                    limits:
+                      additionalProperties:
+                        anyOf:
+                        - type: integer
+                        - type: string
+                        pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
+                        x-kubernetes-int-or-string: true
+                      description: |-
+                        Limits defines the required accelerators to serve the model for each replica,
+                        like . For multi-host cases, the limits here indicate
+                        the resource requirements for each replica, usually equals to the TP size.
+                        Not recommended to set the cpu and memory usage here:
+                        - if using playground, you can define the cpu/mem usage at backendConfig.
+                        - if using inference service, you can define the cpu/mem at the container resources.
+                        However, if you define the same accelerator resources at playground/service as well,
+                        the resources will be overwritten by the flavor limits here.
+                      type: object
                     name:
                       description: Name represents the flavor name, which will be used in model claim.
@@ -83,23 +101,6 @@ spec:
                         with for AWS. Preset parameters: TP, PP, INSTANCE-TYPE.
                       type: object
-                    requests:
-                      additionalProperties:
-                        anyOf:
-                        - type: integer
-                        - type: string
-                        pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
-                        x-kubernetes-int-or-string: true
-                      description: |-
-                        Requests defines the required accelerators to serve the model for each replica,
-                        like . For multi-hosts cases, the requests here indicates
-                        the resource requirements for each replica, usually equals to the TP size.
-                        Not recommended to set the cpu and memory usage here:
-                        - if using playground, you can define the cpu/mem usage at backendConfig.
-                        - if using inference service, you can define the cpu/mem at the container resources.
-                        However, if you define the same accelerator requests at playground/service as well,
-                        the requests will be overwritten by the flavor requests.
-                      type: object
                   required:
                   - name
                   type: object
@@ -112,6 +113,8 @@ spec:
                 description: |-
                   SharedMemorySize represents the size of /dev/shm required in the runtime of
                   inference workload.
+                  This is only used in Playground. Inference Service can configure the shared memory
+                  directly in PodSpec.
                 pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
                 x-kubernetes-int-or-string: true
               type: object
diff --git a/docs/examples/hostpath/model.yaml b/docs/examples/hostpath/model.yaml
index 294865a7..66830ffd 100644
--- a/docs/examples/hostpath/model.yaml
+++ b/docs/examples/hostpath/model.yaml
@@ -10,5 +10,5 @@ spec:
   inferenceConfig:
     flavors:
       - name: t4 # GPU type
-        requests:
+        limits:
           nvidia.com/gpu: 1
diff --git a/docs/examples/huggingface/model.yaml b/docs/examples/huggingface/model.yaml
index 2d7749f7..7cc24cdc 100644
--- a/docs/examples/huggingface/model.yaml
+++ b/docs/examples/huggingface/model.yaml
@@ -10,5 +10,5 @@ spec:
   inferenceConfig:
     flavors:
      - name: t4 # GPU type
-        requests:
+        limits:
           nvidia.com/gpu: 1
diff --git a/docs/examples/modelscope/model.yaml b/docs/examples/modelscope/model.yaml
index 288b1dd8..c7ed9017 100644
--- a/docs/examples/modelscope/model.yaml
+++ b/docs/examples/modelscope/model.yaml
@@ -11,5 +11,5 @@ spec:
   inferenceConfig:
     flavors:
       - name: t4 # GPU type
-        requests:
+        limits:
           nvidia.com/gpu: 1
diff --git a/docs/examples/multi-nodes/model.yaml b/docs/examples/multi-nodes/model.yaml
index 98736386..cb8fc195 100644
--- a/docs/examples/multi-nodes/model.yaml
+++ b/docs/examples/multi-nodes/model.yaml
@@ -10,13 +10,13 @@ spec:
   inferenceConfig:
     flavors:
       - name: a100-80gb
-        requests:
+        limits:
           nvidia.com/gpu: 8 # single node request
         params:
           TP: "8" # 8 GPUs per node, equal to nvidia.com/gpu
           PP: "2" # 2 nodes
       # - name: h100
-      #   requests:
+      #   limits:
       #     nvidia.com/gpu: 8 # single node request
      #   params:
       #     TP: "8"
diff --git a/docs/examples/objstore-oss/model.yaml b/docs/examples/objstore-oss/model.yaml
index 1422cee9..a9b03774 100644
--- a/docs/examples/objstore-oss/model.yaml
+++ b/docs/examples/objstore-oss/model.yaml
@@ -11,5 +11,5 @@ spec:
   inferenceConfig:
     flavors:
       - name: t4 # GPU type
-        requests:
+        limits:
           nvidia.com/gpu: 1
diff --git a/docs/examples/sglang/model.yaml b/docs/examples/sglang/model.yaml
index 8da61042..860fe04b 100644
--- a/docs/examples/sglang/model.yaml
+++ b/docs/examples/sglang/model.yaml
@@ -10,5 +10,5 @@ spec:
   inferenceConfig:
     flavors:
       - name: t4 # GPU type
-        requests:
+        limits:
           nvidia.com/gpu: 1
diff --git a/docs/examples/speculative-decoding/vllm/model.yaml b/docs/examples/speculative-decoding/vllm/model.yaml
index 7468fea7..d4ac66c1 100644
--- a/docs/examples/speculative-decoding/vllm/model.yaml
+++ b/docs/examples/speculative-decoding/vllm/model.yaml
@@ -10,7 +10,7 @@ spec:
   inferenceConfig:
     flavors:
       - name: a10 # gpu type
-        requests:
+        limits:
           nvidia.com/gpu: 1
 ---
 apiVersion: llmaz.io/v1alpha1
diff --git a/docs/examples/tgi/model.yaml b/docs/examples/tgi/model.yaml
index 8da61042..860fe04b 100644
--- a/docs/examples/tgi/model.yaml
+++ b/docs/examples/tgi/model.yaml
@@ -10,5 +10,5 @@ spec:
   inferenceConfig:
     flavors:
       - name: t4 # GPU type
-        requests:
+        limits:
           nvidia.com/gpu: 1
diff --git a/pkg/controller/inference/service_controller.go b/pkg/controller/inference/service_controller.go
index 24aaa93b..3aae6e8e 100644
--- a/pkg/controller/inference/service_controller.go
+++ b/pkg/controller/inference/service_controller.go
@@ -201,8 +201,8 @@ func injectModelFlavor(template *corev1.PodTemplateSpec, model *coreapi.OpenMode
 
 	for i, flavor := range model.Spec.InferenceConfig.Flavors {
 		if flavor.Name == flavorName {
-			requests := model.Spec.InferenceConfig.Flavors[i].Requests
-			for k, v := range requests {
+			limits := model.Spec.InferenceConfig.Flavors[i].Limits
+			for k, v := range limits {
 				if container.Resources.Requests == nil {
 					container.Resources.Requests = map[corev1.ResourceName]resource.Quantity{}
 				}
diff --git a/pkg/controller_helper/helper.go b/pkg/controller_helper/helper.go
index 0b33123b..99cad552 100644
--- a/pkg/controller_helper/helper.go
+++ b/pkg/controller_helper/helper.go
@@ -121,6 +121,9 @@ func FirstAssignedFlavor(model *coreapi.OpenModel, playground *inferenceapi.Play
 // the second one is whether this is a multi-host inference.
 func MultiHostInference(model *coreapi.OpenModel, playground *inferenceapi.Playground) (int32, bool) {
 	flavors := FirstAssignedFlavor(model, playground)
+	// This is not valid for all cases; for example, SGLang uses TP for model parallelism.
+	// However, that is not a recommended way since TP requires more communication than PP.
+	// It's OK to support only PP for now.
 	if len(flavors) > 0 && flavors[0].Params["PP"] != "" {
 		size, err := strconv.Atoi(flavors[0].Params["PP"])
 		if err != nil {
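To make the PP arithmetic above concrete, here is a hedged, hypothetical sketch (not in this PR) that mirrors what MultiHostInference derives from the flavor params; with the docs/examples/multi-nodes flavor (limits nvidia.com/gpu: 8, TP "8", PP "2"), each replica spans 2 hosts with 8 GPUs per host. The fallback behavior on a malformed PP value is an assumption, not the real helper's.

package main

import (
	"fmt"
	"strconv"
)

// hostsPerReplica returns how many hosts one replica spans, based on the PP
// flavor param; the flavor limits apply per host.
func hostsPerReplica(params map[string]string) (int32, bool) {
	pp := params["PP"]
	if pp == "" {
		return 1, false // no PP param: single-host inference
	}
	size, err := strconv.Atoi(pp)
	if err != nil {
		return 1, false // hypothetical fallback; the real helper handles this error itself
	}
	return int32(size), size > 1
}

func main() {
	hosts, multi := hostsPerReplica(map[string]string{"TP": "8", "PP": "2"})
	fmt.Println(hosts, multi) // 2 true
}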
diff --git a/test/util/validation/validate_service.go b/test/util/validation/validate_service.go
index 81eb6193..aaf65236 100644
--- a/test/util/validation/validate_service.go
+++ b/test/util/validation/validate_service.go
@@ -174,9 +174,9 @@ func ValidateModelFlavor(service *inferenceapi.Service, model *coreapi.OpenModel
 
 	for _, flavor := range model.Spec.InferenceConfig.Flavors {
 		if flavor.Name == flavorName {
-			requests := flavor.Requests
+			limits := flavor.Limits
 			container := workload.Spec.LeaderWorkerTemplate.WorkerTemplate.Spec.Containers[0]
-			for k, v := range requests {
+			for k, v := range limits {
 				if !container.Resources.Requests[k].Equal(v) {
 					return fmt.Errorf("unexpected request value %v, got %v", v, workload.Spec.LeaderWorkerTemplate.WorkerTemplate.Spec.Containers[0].Resources.Requests[k])
 				}
diff --git a/test/util/wrapper/model.go b/test/util/wrapper/model.go
index 11e930b8..a0a4ebab 100644
--- a/test/util/wrapper/model.go
+++ b/test/util/wrapper/model.go
@@ -133,10 +133,10 @@ func (w *FlavorWrapper) Obj() *coreapi.Flavor {
 }
 
 func (w *FlavorWrapper) SetRequest(r, v string) *FlavorWrapper {
-	if w.Requests == nil {
-		w.Requests = map[v1.ResourceName]resource.Quantity{}
+	if w.Limits == nil {
+		w.Limits = map[v1.ResourceName]resource.Quantity{}
 	}
-	w.Requests[v1.ResourceName(r)] = resource.MustParse(v)
+	w.Limits[v1.ResourceName(r)] = resource.MustParse(v)
 	return w
 }