From 70b4a3ba4a440dea53b7bbcd0dc787e7aba18391 Mon Sep 17 00:00:00 2001
From: kerthcet
Date: Thu, 26 Dec 2024 17:34:18 +0800
Subject: [PATCH] Remove ElasticConfig from Service

Signed-off-by: kerthcet
---
 api/core/v1alpha1/model_types.go              | 15 +++++++------
 api/inference/v1alpha1/service_types.go       |  5 -----
 .../v1alpha1/zz_generated.deepcopy.go         |  5 -----
 .../inference/v1alpha1/servicespec.go         |  9 --------
 .../bases/inference.llmaz.io_playgrounds.yaml | 21 +++++++++++++++++++
 .../bases/inference.llmaz.io_services.yaml    | 21 -------------------
 config/crd/bases/llmaz.io_openmodels.yaml     | 15 +++++++------
 test/util/wrapper/playground.go               |  9 ++++++++
 test/util/wrapper/service.go                  |  9 --------
 9 files changed, 44 insertions(+), 65 deletions(-)

diff --git a/api/core/v1alpha1/model_types.go b/api/core/v1alpha1/model_types.go
index cebede17..abb72c29 100644
--- a/api/core/v1alpha1/model_types.go
+++ b/api/core/v1alpha1/model_types.go
@@ -98,14 +98,13 @@ type FlavorName string
 type Flavor struct {
     // Name represents the flavor name, which will be used in model claim.
     Name FlavorName `json:"name"`
-    // Requests defines the required accelerators to serve the model, like nvidia.com/gpu: 8.
-    // When GPU number is greater than 8, like 32, then multi-host inference is enabled and
-    // 32/8=4 hosts will be grouped as an unit, each host will have a resource request as
-    // nvidia.com/gpu: 8. The may change in the future if the GPU number limit is broken.
-    // Not recommended to set the cpu and memory usage here.
-    // If using playground, you can define the cpu/mem usage at backendConfig.
-    // If using service, you can define the cpu/mem at the container resources.
-    // Note: if you define the same accelerator requests at playground/service as well,
+    // Requests defines the required accelerators to serve the model for each replica,
+    // like <nvidia.com/gpu: 8>. For multi-host cases, the requests here indicate
+    // the resource requirements for each replica. This may change in the future.
+    // Not recommended to set the cpu and memory usage here:
+    // - if using playground, you can define the cpu/mem usage at backendConfig.
+    // - if using inference service, you can define the cpu/mem at the container resources.
+    // However, if you define the same accelerator requests at playground/service as well,
     // the requests here will be covered.
     // +optional
     Requests v1.ResourceList `json:"requests,omitempty"`
diff --git a/api/inference/v1alpha1/service_types.go b/api/inference/v1alpha1/service_types.go
index 7de6087d..f12d531d 100644
--- a/api/inference/v1alpha1/service_types.go
+++ b/api/inference/v1alpha1/service_types.go
@@ -35,11 +35,6 @@ type ServiceSpec struct {
     // LWS supports both single-host and multi-host scenarios, for single host
     // cases, only need to care about replicas, rolloutStrategy and workerTemplate.
     WorkloadTemplate lws.LeaderWorkerSetSpec `json:"workloadTemplate"`
-    // ElasticConfig defines the configuration for elastic usage,
-    // e.g. the max/min replicas. Default to 0 ~ Inf+.
-    // This requires to install the HPA first or will not work.
-    // +optional
-    ElasticConfig *ElasticConfig `json:"elasticConfig,omitempty"`
 }
 
 const (
diff --git a/api/inference/v1alpha1/zz_generated.deepcopy.go b/api/inference/v1alpha1/zz_generated.deepcopy.go
index 917f0541..41263081 100644
--- a/api/inference/v1alpha1/zz_generated.deepcopy.go
+++ b/api/inference/v1alpha1/zz_generated.deepcopy.go
@@ -444,11 +444,6 @@ func (in *ServiceSpec) DeepCopyInto(out *ServiceSpec) {
     *out = *in
     in.ModelClaims.DeepCopyInto(&out.ModelClaims)
     in.WorkloadTemplate.DeepCopyInto(&out.WorkloadTemplate)
-    if in.ElasticConfig != nil {
-        in, out := &in.ElasticConfig, &out.ElasticConfig
-        *out = new(ElasticConfig)
-        (*in).DeepCopyInto(*out)
-    }
 }
 
 // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ServiceSpec.
diff --git a/client-go/applyconfiguration/inference/v1alpha1/servicespec.go b/client-go/applyconfiguration/inference/v1alpha1/servicespec.go
index a8cb5617..2666c01b 100644
--- a/client-go/applyconfiguration/inference/v1alpha1/servicespec.go
+++ b/client-go/applyconfiguration/inference/v1alpha1/servicespec.go
@@ -27,7 +27,6 @@ import (
 type ServiceSpecApplyConfiguration struct {
     ModelClaims      *v1alpha1.ModelClaimsApplyConfiguration `json:"modelClaims,omitempty"`
     WorkloadTemplate *v1.LeaderWorkerSetSpec                 `json:"workloadTemplate,omitempty"`
-    ElasticConfig    *ElasticConfigApplyConfiguration        `json:"elasticConfig,omitempty"`
 }
 
 // ServiceSpecApplyConfiguration constructs a declarative configuration of the ServiceSpec type for use with
@@ -51,11 +50,3 @@ func (b *ServiceSpecApplyConfiguration) WithWorkloadTemplate(value v1.LeaderWork
     b.WorkloadTemplate = &value
     return b
 }
-
-// WithElasticConfig sets the ElasticConfig field in the declarative configuration to the given value
-// and returns the receiver, so that objects can be built by chaining "With" function invocations.
-// If called multiple times, the ElasticConfig field is set to the value of the last call.
-func (b *ServiceSpecApplyConfiguration) WithElasticConfig(value *ElasticConfigApplyConfiguration) *ServiceSpecApplyConfiguration {
-    b.ElasticConfig = value
-    return b
-}
diff --git a/config/crd/bases/inference.llmaz.io_playgrounds.yaml b/config/crd/bases/inference.llmaz.io_playgrounds.yaml
index 8278e728..d4d6f480 100644
--- a/config/crd/bases/inference.llmaz.io_playgrounds.yaml
+++ b/config/crd/bases/inference.llmaz.io_playgrounds.yaml
@@ -215,6 +215,27 @@ spec:
                       from the default version.
                     type: string
                 type: object
+            elasticConfig:
+              description: |-
+                ElasticConfig defines the configuration for elastic usage,
+                e.g. the max/min replicas. Default to 0 ~ Inf+.
+                This requires to install the HPA first or will not work.
+              properties:
+                maxReplicas:
+                  description: |-
+                    MaxReplicas indicates the maximum number of inference workloads based on the traffic.
+                    Default to nil means there's no limit for the instance number.
+                  format: int32
+                  type: integer
+                minReplicas:
+                  default: 1
+                  description: |-
+                    MinReplicas indicates the minimum number of inference workloads based on the traffic.
+                    Default to nil means we can scale down the instances to 1.
+                    If minReplicas set to 0, it requires to install serverless component at first.
+                  format: int32
+                  type: integer
+              type: object
             modelClaim:
               description: |-
                 ModelClaim represents claiming for one model, it's a simplified use case
diff --git a/config/crd/bases/inference.llmaz.io_services.yaml b/config/crd/bases/inference.llmaz.io_services.yaml
index 0a86a17f..b788437d 100644
--- a/config/crd/bases/inference.llmaz.io_services.yaml
+++ b/config/crd/bases/inference.llmaz.io_services.yaml
@@ -44,27 +44,6 @@ spec:
               Service controller will maintain multi-flavor of workloads with
               different accelerators for cost or performance considerations.
             properties:
-              elasticConfig:
-                description: |-
-                  ElasticConfig defines the configuration for elastic usage,
-                  e.g. the max/min replicas. Default to 0 ~ Inf+.
-                  This requires to install the HPA first or will not work.
-                properties:
-                  maxReplicas:
-                    description: |-
-                      MaxReplicas indicates the maximum number of inference workloads based on the traffic.
-                      Default to nil means there's no limit for the instance number.
-                    format: int32
-                    type: integer
-                  minReplicas:
-                    default: 1
-                    description: |-
-                      MinReplicas indicates the minimum number of inference workloads based on the traffic.
-                      Default to nil means we can scale down the instances to 1.
-                      If minReplicas set to 0, it requires to install serverless component at first.
-                    format: int32
-                    type: integer
-                type: object
               modelClaims:
                 description: ModelClaims represents multiple claims for different
                   models.
diff --git a/config/crd/bases/llmaz.io_openmodels.yaml b/config/crd/bases/llmaz.io_openmodels.yaml
index b26622ba..803efc33 100644
--- a/config/crd/bases/llmaz.io_openmodels.yaml
+++ b/config/crd/bases/llmaz.io_openmodels.yaml
@@ -86,14 +86,13 @@ spec:
                             pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
                             x-kubernetes-int-or-string: true
                           description: |-
-                            Requests defines the required accelerators to serve the model, like nvidia.com/gpu: 8.
-                            When GPU number is greater than 8, like 32, then multi-host inference is enabled and
-                            32/8=4 hosts will be grouped as an unit, each host will have a resource request as
-                            nvidia.com/gpu: 8. The may change in the future if the GPU number limit is broken.
-                            Not recommended to set the cpu and memory usage here.
-                            If using playground, you can define the cpu/mem usage at backendConfig.
-                            If using service, you can define the cpu/mem at the container resources.
-                            Note: if you define the same accelerator requests at playground/service as well,
+                            Requests defines the required accelerators to serve the model for each replica,
+                            like <nvidia.com/gpu: 8>. For multi-host cases, the requests here indicate
+                            the resource requirements for each replica. This may change in the future.
+                            Not recommended to set the cpu and memory usage here:
+                            - if using playground, you can define the cpu/mem usage at backendConfig.
+                            - if using inference service, you can define the cpu/mem at the container resources.
+                            However, if you define the same accelerator requests at playground/service as well,
                             the requests here will be covered.
                           type: object
                       required:
diff --git a/test/util/wrapper/playground.go b/test/util/wrapper/playground.go
index 54aabab2..b053e076 100644
--- a/test/util/wrapper/playground.go
+++ b/test/util/wrapper/playground.go
@@ -22,6 +22,7 @@ import (
     v1 "k8s.io/api/core/v1"
     "k8s.io/apimachinery/pkg/api/resource"
     metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+    "k8s.io/utils/ptr"
 )
 
 type PlaygroundWrapper struct {
@@ -154,3 +155,11 @@ func (w *PlaygroundWrapper) BackendRuntimeLimit(r, v string) *PlaygroundWrapper
     w.Spec.BackendRuntimeConfig.Resources.Limits[v1.ResourceName(r)] = resource.MustParse(v)
     return w
 }
+
+func (w *PlaygroundWrapper) ElasticConfig(maxReplicas, minReplicas int32) *PlaygroundWrapper {
+    w.Spec.ElasticConfig = &inferenceapi.ElasticConfig{
+        MaxReplicas: ptr.To[int32](maxReplicas),
+        MinReplicas: ptr.To[int32](minReplicas),
+    }
+    return w
+}
diff --git a/test/util/wrapper/service.go b/test/util/wrapper/service.go
index 37b6268e..3f89c3f7 100644
--- a/test/util/wrapper/service.go
+++ b/test/util/wrapper/service.go
@@ -19,7 +19,6 @@ package wrapper
 import (
     corev1 "k8s.io/api/core/v1"
     metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
-    "k8s.io/utils/ptr"
     lws "sigs.k8s.io/lws/api/leaderworkerset/v1"
 
     coreapi "github.com/inftyai/llmaz/api/core/v1alpha1"
@@ -65,14 +64,6 @@ func (w *ServiceWrapper) ModelClaims(modelNames []string, roles []string, flavor
     return w
 }
 
-func (w *ServiceWrapper) ElasticConfig(maxReplicas, minReplicas int32) *ServiceWrapper {
-    w.Spec.ElasticConfig = &inferenceapi.ElasticConfig{
-        MaxReplicas: ptr.To[int32](maxReplicas),
-        MinReplicas: ptr.To[int32](minReplicas),
-    }
-    return w
-}
-
 func (w *ServiceWrapper) WorkerTemplate() *ServiceWrapper {
     w.Spec.WorkloadTemplate.RolloutStrategy = lws.RolloutStrategy{
         Type: lws.RollingUpdateStrategyType,
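
For reference, the relocated elastic scaling knob can be exercised in Go roughly as follows. This is a minimal sketch based only on what this patch shows (an ElasticConfig with pointer MaxReplicas/MinReplicas fields, now set on the Playground spec by the new PlaygroundWrapper.ElasticConfig helper); the inference API import path and the concrete replica numbers are assumptions for illustration, not part of the change.

package main

import (
    "fmt"

    "k8s.io/utils/ptr"

    // Assumed import path, inferred from the api/inference/v1alpha1 layout above.
    inferenceapi "github.com/inftyai/llmaz/api/inference/v1alpha1"
)

func main() {
    // ElasticConfig now belongs to the Playground spec only: scale between
    // 1 and 3 replicas once the HPA prerequisite is installed.
    elastic := &inferenceapi.ElasticConfig{
        MaxReplicas: ptr.To[int32](3),
        MinReplicas: ptr.To[int32](1),
    }
    fmt.Printf("minReplicas=%d maxReplicas=%d\n", *elastic.MinReplicas, *elastic.MaxReplicas)
}

After this change a Service spec carries no elasticConfig field at all, so any autoscaling for a Service would have to be configured elsewhere, while a Playground keeps elasticConfig as reflected in the regenerated playgrounds CRD.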
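
The reworded Flavor.Requests contract (accelerators per replica, with cpu/mem left to backendConfig or the container resources) can likewise be illustrated with a small sketch; the flavor name and GPU count below are made up, and only the Flavor/FlavorName types and the coreapi import path come from this patch.

package main

import (
    "fmt"

    v1 "k8s.io/api/core/v1"
    "k8s.io/apimachinery/pkg/api/resource"

    coreapi "github.com/inftyai/llmaz/api/core/v1alpha1"
)

func main() {
    // One flavor asking for 8 GPUs per replica; cpu/mem are deliberately
    // omitted, as the updated comment recommends.
    flavor := coreapi.Flavor{
        Name: coreapi.FlavorName("a100"),
        Requests: v1.ResourceList{
            v1.ResourceName("nvidia.com/gpu"): resource.MustParse("8"),
        },
    }
    gpus := flavor.Requests[v1.ResourceName("nvidia.com/gpu")]
    fmt.Printf("flavor %s requests %s GPU(s) per replica\n", flavor.Name, gpus.String())
}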