
Commit 0c29bbf

Merge pull request #224 from kerthcet/feat/fungibility
Remove ElasticConfig from Service
2 parents 590d58d + 70b4a3b commit 0c29bbf

File tree

9 files changed (+44, -65 lines)


api/core/v1alpha1/model_types.go

Lines changed: 7 additions & 8 deletions

@@ -98,14 +98,13 @@ type FlavorName string
 type Flavor struct {
     // Name represents the flavor name, which will be used in model claim.
     Name FlavorName `json:"name"`
-    // Requests defines the required accelerators to serve the model, like nvidia.com/gpu: 8.
-    // When GPU number is greater than 8, like 32, then multi-host inference is enabled and
-    // 32/8=4 hosts will be grouped as an unit, each host will have a resource request as
-    // nvidia.com/gpu: 8. The may change in the future if the GPU number limit is broken.
-    // Not recommended to set the cpu and memory usage here.
-    // If using playground, you can define the cpu/mem usage at backendConfig.
-    // If using service, you can define the cpu/mem at the container resources.
-    // Note: if you define the same accelerator requests at playground/service as well,
+    // Requests defines the required accelerators to serve the model for each replica,
+    // like <nvidia.com/gpu: 8>. For multi-hosts cases, the requests here indicates
+    // the resource requirements for each replica. This may change in the future.
+    // Not recommended to set the cpu and memory usage here:
+    // - if using playground, you can define the cpu/mem usage at backendConfig.
+    // - if using inference service, you can define the cpu/mem at the container resources.
+    // However, if you define the same accelerator requests at playground/service as well,
     // the requests here will be covered.
     // +optional
     Requests v1.ResourceList `json:"requests,omitempty"`
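
For illustration, a Flavor that asks for 8 GPUs per replica could look like the sketch below. The flavor name "a100" and the GPU count are made up for the example; the types and the coreapi import alias are the ones appearing elsewhere in this commit.

    import (
        coreapi "github.com/inftyai/llmaz/api/core/v1alpha1"
        v1 "k8s.io/api/core/v1"
        "k8s.io/apimachinery/pkg/api/resource"
    )

    // Sketch only: one flavor requesting 8 GPUs per replica, mirroring the
    // <nvidia.com/gpu: 8> example in the updated comment. "a100" is purely
    // an illustrative flavor name.
    flavor := coreapi.Flavor{
        Name: "a100",
        Requests: v1.ResourceList{
            v1.ResourceName("nvidia.com/gpu"): resource.MustParse("8"),
        },
    }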

api/inference/v1alpha1/service_types.go

Lines changed: 0 additions & 5 deletions

@@ -35,11 +35,6 @@ type ServiceSpec struct {
     // LWS supports both single-host and multi-host scenarios, for single host
     // cases, only need to care about replicas, rolloutStrategy and workerTemplate.
     WorkloadTemplate lws.LeaderWorkerSetSpec `json:"workloadTemplate"`
-    // ElasticConfig defines the configuration for elastic usage,
-    // e.g. the max/min replicas. Default to 0 ~ Inf+.
-    // This requires to install the HPA first or will not work.
-    // +optional
-    ElasticConfig *ElasticConfig `json:"elasticConfig,omitempty"`
 }

 const (

api/inference/v1alpha1/zz_generated.deepcopy.go

Lines changed: 0 additions & 5 deletions
Some generated files are not rendered by default.

client-go/applyconfiguration/inference/v1alpha1/servicespec.go

Lines changed: 0 additions & 9 deletions
Some generated files are not rendered by default.

config/crd/bases/inference.llmaz.io_playgrounds.yaml

Lines changed: 21 additions & 0 deletions

@@ -215,6 +215,27 @@ spec:
                 from the default version.
               type: string
             type: object
+          elasticConfig:
+            description: |-
+              ElasticConfig defines the configuration for elastic usage,
+              e.g. the max/min replicas. Default to 0 ~ Inf+.
+              This requires to install the HPA first or will not work.
+            properties:
+              maxReplicas:
+                description: |-
+                  MaxReplicas indicates the maximum number of inference workloads based on the traffic.
+                  Default to nil means there's no limit for the instance number.
+                format: int32
+                type: integer
+              minReplicas:
+                default: 1
+                description: |-
+                  MinReplicas indicates the minimum number of inference workloads based on the traffic.
+                  Default to nil means we can scale down the instances to 1.
+                  If minReplicas set to 0, it requires to install serverless component at first.
+                format: int32
+                type: integer
+            type: object
           modelClaim:
             description: |-
               ModelClaim represents claiming for one model, it's a simplified use case
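
In Go, wiring these new fields onto a Playground's spec would presumably look like the sketch below. The inferenceapi import path and the surrounding playground variable are assumptions based on the test wrapper changes later in this commit; the replica numbers are illustrative.

    import (
        inferenceapi "github.com/inftyai/llmaz/api/inference/v1alpha1"
        "k8s.io/utils/ptr"
    )

    // Sketch: let the Playground scale between 1 and 3 replicas. Per the field
    // description above, an HPA must be installed for this to take effect.
    // `playground` is assumed to be an existing Playground object.
    playground.Spec.ElasticConfig = &inferenceapi.ElasticConfig{
        MinReplicas: ptr.To[int32](1),
        MaxReplicas: ptr.To[int32](3),
    }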

config/crd/bases/inference.llmaz.io_services.yaml

Lines changed: 0 additions & 21 deletions

@@ -44,27 +44,6 @@ spec:
             Service controller will maintain multi-flavor of workloads with
             different accelerators for cost or performance considerations.
           properties:
-            elasticConfig:
-              description: |-
-                ElasticConfig defines the configuration for elastic usage,
-                e.g. the max/min replicas. Default to 0 ~ Inf+.
-                This requires to install the HPA first or will not work.
-              properties:
-                maxReplicas:
-                  description: |-
-                    MaxReplicas indicates the maximum number of inference workloads based on the traffic.
-                    Default to nil means there's no limit for the instance number.
-                  format: int32
-                  type: integer
-                minReplicas:
-                  default: 1
-                  description: |-
-                    MinReplicas indicates the minimum number of inference workloads based on the traffic.
-                    Default to nil means we can scale down the instances to 1.
-                    If minReplicas set to 0, it requires to install serverless component at first.
-                  format: int32
-                  type: integer
-              type: object
             modelClaims:
               description: ModelClaims represents multiple claims for different
                 models.

config/crd/bases/llmaz.io_openmodels.yaml

Lines changed: 7 additions & 8 deletions

@@ -86,14 +86,13 @@ spec:
                 pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
                 x-kubernetes-int-or-string: true
               description: |-
-                Requests defines the required accelerators to serve the model, like nvidia.com/gpu: 8.
-                When GPU number is greater than 8, like 32, then multi-host inference is enabled and
-                32/8=4 hosts will be grouped as an unit, each host will have a resource request as
-                nvidia.com/gpu: 8. The may change in the future if the GPU number limit is broken.
-                Not recommended to set the cpu and memory usage here.
-                If using playground, you can define the cpu/mem usage at backendConfig.
-                If using service, you can define the cpu/mem at the container resources.
-                Note: if you define the same accelerator requests at playground/service as well,
+                Requests defines the required accelerators to serve the model for each replica,
+                like <nvidia.com/gpu: 8>. For multi-hosts cases, the requests here indicates
+                the resource requirements for each replica. This may change in the future.
+                Not recommended to set the cpu and memory usage here:
+                - if using playground, you can define the cpu/mem usage at backendConfig.
+                - if using inference service, you can define the cpu/mem at the container resources.
+                However, if you define the same accelerator requests at playground/service as well,
                 the requests here will be covered.
               type: object
             required:

test/util/wrapper/playground.go

Lines changed: 9 additions & 0 deletions

@@ -22,6 +22,7 @@ import (
     v1 "k8s.io/api/core/v1"
     "k8s.io/apimachinery/pkg/api/resource"
     metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+    "k8s.io/utils/ptr"
 )

 type PlaygroundWrapper struct {

@@ -154,3 +155,11 @@ func (w *PlaygroundWrapper) BackendRuntimeLimit(r, v string) *PlaygroundWrapper
     w.Spec.BackendRuntimeConfig.Resources.Limits[v1.ResourceName(r)] = resource.MustParse(v)
     return w
 }
+
+func (w *PlaygroundWrapper) ElasticConfig(maxReplicas, minReplicas int32) *PlaygroundWrapper {
+    w.Spec.ElasticConfig = &inferenceapi.ElasticConfig{
+        MaxReplicas: ptr.To[int32](maxReplicas),
+        MinReplicas: ptr.To[int32](minReplicas),
+    }
+    return w
+}
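
Note the argument order here: maxReplicas first, then minReplicas. Limiting a playground to a 1–3 replica range in a test would presumably read as below; how the *PlaygroundWrapper is built is outside this diff.

    // Sketch: w is an existing *PlaygroundWrapper constructed elsewhere in the test.
    w = w.ElasticConfig(3, 1) // maxReplicas=3, minReplicas=1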

test/util/wrapper/service.go

Lines changed: 0 additions & 9 deletions

@@ -19,7 +19,6 @@ package wrapper
 import (
     corev1 "k8s.io/api/core/v1"
     metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
-    "k8s.io/utils/ptr"
     lws "sigs.k8s.io/lws/api/leaderworkerset/v1"

     coreapi "github.com/inftyai/llmaz/api/core/v1alpha1"

@@ -65,14 +64,6 @@ func (w *ServiceWrapper) ModelClaims(modelNames []string, roles []string, flavor
     return w
 }

-func (w *ServiceWrapper) ElasticConfig(maxReplicas, minReplicas int32) *ServiceWrapper {
-    w.Spec.ElasticConfig = &inferenceapi.ElasticConfig{
-        MaxReplicas: ptr.To[int32](maxReplicas),
-        MinReplicas: ptr.To[int32](minReplicas),
-    }
-    return w
-}
-
 func (w *ServiceWrapper) WorkerTemplate() *ServiceWrapper {
     w.Spec.WorkloadTemplate.RolloutStrategy = lws.RolloutStrategy{
         Type: lws.RollingUpdateStrategyType,
