From 70b4a3ba4a440dea53b7bbcd0dc787e7aba18391 Mon Sep 17 00:00:00 2001
From: kerthcet
Date: Thu, 26 Dec 2024 17:34:18 +0800
Subject: [PATCH] Remove ElasticConfig from Service

Signed-off-by: kerthcet
---
 api/core/v1alpha1/model_types.go              | 15 +++++++------
 api/inference/v1alpha1/service_types.go       |  5 -----
 .../v1alpha1/zz_generated.deepcopy.go         |  5 -----
 .../inference/v1alpha1/servicespec.go         |  9 --------
 .../bases/inference.llmaz.io_playgrounds.yaml | 21 +++++++++++++++++++
 .../bases/inference.llmaz.io_services.yaml    | 21 -------------------
 config/crd/bases/llmaz.io_openmodels.yaml     | 15 +++++++------
 test/util/wrapper/playground.go               |  9 ++++++++
 test/util/wrapper/service.go                  |  9 --------
 9 files changed, 44 insertions(+), 65 deletions(-)

diff --git a/api/core/v1alpha1/model_types.go b/api/core/v1alpha1/model_types.go
index cebede17..abb72c29 100644
--- a/api/core/v1alpha1/model_types.go
+++ b/api/core/v1alpha1/model_types.go
@@ -98,14 +98,13 @@ type FlavorName string
 type Flavor struct {
     // Name represents the flavor name, which will be used in model claim.
     Name FlavorName `json:"name"`
-    // Requests defines the required accelerators to serve the model, like nvidia.com/gpu: 8.
-    // When GPU number is greater than 8, like 32, then multi-host inference is enabled and
-    // 32/8=4 hosts will be grouped as an unit, each host will have a resource request as
-    // nvidia.com/gpu: 8. The may change in the future if the GPU number limit is broken.
-    // Not recommended to set the cpu and memory usage here.
-    // If using playground, you can define the cpu/mem usage at backendConfig.
-    // If using service, you can define the cpu/mem at the container resources.
-    // Note: if you define the same accelerator requests at playground/service as well,
+    // Requests defines the required accelerators to serve the model for each replica,
+    // like <nvidia.com/gpu: 8>. For multi-host cases, the requests here indicate
+    // the resource requirements for each replica. This may change in the future.
+    // Not recommended to set the cpu and memory usage here:
+    // - if using playground, you can define the cpu/mem usage at backendConfig.
+    // - if using inference service, you can define the cpu/mem at the container resources.
+    // However, if you define the same accelerator requests at playground/service as well,
     // the requests here will be covered.
     // +optional
     Requests v1.ResourceList `json:"requests,omitempty"`
diff --git a/api/inference/v1alpha1/service_types.go b/api/inference/v1alpha1/service_types.go
index 7de6087d..f12d531d 100644
--- a/api/inference/v1alpha1/service_types.go
+++ b/api/inference/v1alpha1/service_types.go
@@ -35,11 +35,6 @@ type ServiceSpec struct {
     // LWS supports both single-host and multi-host scenarios, for single host
     // cases, only need to care about replicas, rolloutStrategy and workerTemplate.
     WorkloadTemplate lws.LeaderWorkerSetSpec `json:"workloadTemplate"`
-    // ElasticConfig defines the configuration for elastic usage,
-    // e.g. the max/min replicas. Default to 0 ~ Inf+.
-    // This requires to install the HPA first or will not work.
-    // +optional
-    ElasticConfig *ElasticConfig `json:"elasticConfig,omitempty"`
 }
 
 const (
diff --git a/api/inference/v1alpha1/zz_generated.deepcopy.go b/api/inference/v1alpha1/zz_generated.deepcopy.go
index 917f0541..41263081 100644
--- a/api/inference/v1alpha1/zz_generated.deepcopy.go
+++ b/api/inference/v1alpha1/zz_generated.deepcopy.go
@@ -444,11 +444,6 @@ func (in *ServiceSpec) DeepCopyInto(out *ServiceSpec) {
     *out = *in
     in.ModelClaims.DeepCopyInto(&out.ModelClaims)
     in.WorkloadTemplate.DeepCopyInto(&out.WorkloadTemplate)
-    if in.ElasticConfig != nil {
-        in, out := &in.ElasticConfig, &out.ElasticConfig
-        *out = new(ElasticConfig)
-        (*in).DeepCopyInto(*out)
-    }
 }
 
 // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ServiceSpec.
diff --git a/client-go/applyconfiguration/inference/v1alpha1/servicespec.go b/client-go/applyconfiguration/inference/v1alpha1/servicespec.go
index a8cb5617..2666c01b 100644
--- a/client-go/applyconfiguration/inference/v1alpha1/servicespec.go
+++ b/client-go/applyconfiguration/inference/v1alpha1/servicespec.go
@@ -27,7 +27,6 @@ import (
 type ServiceSpecApplyConfiguration struct {
     ModelClaims      *v1alpha1.ModelClaimsApplyConfiguration `json:"modelClaims,omitempty"`
     WorkloadTemplate *v1.LeaderWorkerSetSpec                 `json:"workloadTemplate,omitempty"`
-    ElasticConfig    *ElasticConfigApplyConfiguration        `json:"elasticConfig,omitempty"`
 }
 
 // ServiceSpecApplyConfiguration constructs a declarative configuration of the ServiceSpec type for use with
@@ -51,11 +50,3 @@ func (b *ServiceSpecApplyConfiguration) WithWorkloadTemplate(value v1.LeaderWork
     b.WorkloadTemplate = &value
     return b
 }
-
-// WithElasticConfig sets the ElasticConfig field in the declarative configuration to the given value
-// and returns the receiver, so that objects can be built by chaining "With" function invocations.
-// If called multiple times, the ElasticConfig field is set to the value of the last call.
-func (b *ServiceSpecApplyConfiguration) WithElasticConfig(value *ElasticConfigApplyConfiguration) *ServiceSpecApplyConfiguration {
-    b.ElasticConfig = value
-    return b
-}
diff --git a/config/crd/bases/inference.llmaz.io_playgrounds.yaml b/config/crd/bases/inference.llmaz.io_playgrounds.yaml
index 8278e728..d4d6f480 100644
--- a/config/crd/bases/inference.llmaz.io_playgrounds.yaml
+++ b/config/crd/bases/inference.llmaz.io_playgrounds.yaml
@@ -215,6 +215,27 @@ spec:
                       from the default version.
                     type: string
                 type: object
+            elasticConfig:
+              description: |-
+                ElasticConfig defines the configuration for elastic usage,
+                e.g. the max/min replicas. Default to 0 ~ Inf+.
+                This requires to install the HPA first or will not work.
+              properties:
+                maxReplicas:
+                  description: |-
+                    MaxReplicas indicates the maximum number of inference workloads based on the traffic.
+                    Default to nil means there's no limit for the instance number.
+                  format: int32
+                  type: integer
+                minReplicas:
+                  default: 1
+                  description: |-
+                    MinReplicas indicates the minimum number of inference workloads based on the traffic.
+                    Default to nil means we can scale down the instances to 1.
+                    If minReplicas set to 0, it requires to install serverless component at first.
+                  format: int32
+                  type: integer
+              type: object
             modelClaim:
               description: |-
                 ModelClaim represents claiming for one model, it's a simplified use case
diff --git a/config/crd/bases/inference.llmaz.io_services.yaml b/config/crd/bases/inference.llmaz.io_services.yaml
index 0a86a17f..b788437d 100644
--- a/config/crd/bases/inference.llmaz.io_services.yaml
+++ b/config/crd/bases/inference.llmaz.io_services.yaml
@@ -44,27 +44,6 @@ spec:
               Service controller will maintain multi-flavor of workloads with
               different accelerators for cost or performance considerations.
             properties:
-              elasticConfig:
-                description: |-
-                  ElasticConfig defines the configuration for elastic usage,
-                  e.g. the max/min replicas. Default to 0 ~ Inf+.
-                  This requires to install the HPA first or will not work.
-                properties:
-                  maxReplicas:
-                    description: |-
-                      MaxReplicas indicates the maximum number of inference workloads based on the traffic.
-                      Default to nil means there's no limit for the instance number.
-                    format: int32
-                    type: integer
-                  minReplicas:
-                    default: 1
-                    description: |-
-                      MinReplicas indicates the minimum number of inference workloads based on the traffic.
-                      Default to nil means we can scale down the instances to 1.
-                      If minReplicas set to 0, it requires to install serverless component at first.
-                    format: int32
-                    type: integer
-                type: object
               modelClaims:
                 description: ModelClaims represents multiple claims for different
                   models.
diff --git a/config/crd/bases/llmaz.io_openmodels.yaml b/config/crd/bases/llmaz.io_openmodels.yaml
index b26622ba..803efc33 100644
--- a/config/crd/bases/llmaz.io_openmodels.yaml
+++ b/config/crd/bases/llmaz.io_openmodels.yaml
@@ -86,14 +86,13 @@ spec:
                             pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
                             x-kubernetes-int-or-string: true
                           description: |-
-                            Requests defines the required accelerators to serve the model, like nvidia.com/gpu: 8.
-                            When GPU number is greater than 8, like 32, then multi-host inference is enabled and
-                            32/8=4 hosts will be grouped as an unit, each host will have a resource request as
-                            nvidia.com/gpu: 8. The may change in the future if the GPU number limit is broken.
-                            Not recommended to set the cpu and memory usage here.
-                            If using playground, you can define the cpu/mem usage at backendConfig.
-                            If using service, you can define the cpu/mem at the container resources.
-                            Note: if you define the same accelerator requests at playground/service as well,
+                            Requests defines the required accelerators to serve the model for each replica,
+                            like <nvidia.com/gpu: 8>. For multi-host cases, the requests here indicate
+                            the resource requirements for each replica. This may change in the future.
+                            Not recommended to set the cpu and memory usage here:
+                            - if using playground, you can define the cpu/mem usage at backendConfig.
+                            - if using inference service, you can define the cpu/mem at the container resources.
+                            However, if you define the same accelerator requests at playground/service as well,
                             the requests here will be covered.
                           type: object
                       required:
diff --git a/test/util/wrapper/playground.go b/test/util/wrapper/playground.go
index 54aabab2..b053e076 100644
--- a/test/util/wrapper/playground.go
+++ b/test/util/wrapper/playground.go
@@ -22,6 +22,7 @@ import (
     v1 "k8s.io/api/core/v1"
     "k8s.io/apimachinery/pkg/api/resource"
     metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+    "k8s.io/utils/ptr"
 )
 
 type PlaygroundWrapper struct {
@@ -154,3 +155,11 @@ func (w *PlaygroundWrapper) BackendRuntimeLimit(r, v string) *PlaygroundWrapper
     w.Spec.BackendRuntimeConfig.Resources.Limits[v1.ResourceName(r)] = resource.MustParse(v)
     return w
 }
+
+func (w *PlaygroundWrapper) ElasticConfig(maxReplicas, minReplicas int32) *PlaygroundWrapper {
+    w.Spec.ElasticConfig = &inferenceapi.ElasticConfig{
+        MaxReplicas: ptr.To[int32](maxReplicas),
+        MinReplicas: ptr.To[int32](minReplicas),
+    }
+    return w
+}
diff --git a/test/util/wrapper/service.go b/test/util/wrapper/service.go
index 37b6268e..3f89c3f7 100644
--- a/test/util/wrapper/service.go
+++ b/test/util/wrapper/service.go
@@ -19,7 +19,6 @@ package wrapper
 import (
     corev1 "k8s.io/api/core/v1"
     metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
-    "k8s.io/utils/ptr"
     lws "sigs.k8s.io/lws/api/leaderworkerset/v1"
 
     coreapi "github.com/inftyai/llmaz/api/core/v1alpha1"
@@ -65,14 +64,6 @@ func (w *ServiceWrapper) ModelClaims(modelNames []string, roles []string, flavor
     return w
 }
 
-func (w *ServiceWrapper) ElasticConfig(maxReplicas, minReplicas int32) *ServiceWrapper {
-    w.Spec.ElasticConfig = &inferenceapi.ElasticConfig{
-        MaxReplicas: ptr.To[int32](maxReplicas),
-        MinReplicas: ptr.To[int32](minReplicas),
-    }
-    return w
-}
-
 func (w *ServiceWrapper) WorkerTemplate() *ServiceWrapper {
     w.Spec.WorkloadTemplate.RolloutStrategy = lws.RolloutStrategy{
         Type: lws.RollingUpdateStrategyType,
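
For reference, the relocated elastic scaling knob can be exercised in Go roughly as follows. This is a minimal sketch based only on what this patch shows (an ElasticConfig with pointer MaxReplicas/MinReplicas fields, now set on the Playground spec by the new PlaygroundWrapper.ElasticConfig helper); the inference API import path and the concrete replica numbers are assumptions for illustration, not part of the change.

package main

import (
    "fmt"

    "k8s.io/utils/ptr"

    // Assumed import path, inferred from the api/inference/v1alpha1 layout above.
    inferenceapi "github.com/inftyai/llmaz/api/inference/v1alpha1"
)

func main() {
    // ElasticConfig now belongs to the Playground spec only: scale between
    // 1 and 3 replicas once the HPA prerequisite is installed.
    elastic := &inferenceapi.ElasticConfig{
        MaxReplicas: ptr.To[int32](3),
        MinReplicas: ptr.To[int32](1),
    }
    fmt.Printf("minReplicas=%d maxReplicas=%d\n", *elastic.MinReplicas, *elastic.MaxReplicas)
}

After this change a Service spec carries no elasticConfig field at all, so any autoscaling for a Service would have to be configured elsewhere, while a Playground keeps elasticConfig as reflected in the regenerated playgrounds CRD.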
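
The reworded Flavor.Requests contract (accelerators per replica, with cpu/mem left to backendConfig or the container resources) can likewise be illustrated with a small sketch; the flavor name and GPU count below are made up, and only the Flavor/FlavorName types and the coreapi import path come from this patch.

package main

import (
    "fmt"

    v1 "k8s.io/api/core/v1"
    "k8s.io/apimachinery/pkg/api/resource"

    coreapi "github.com/inftyai/llmaz/api/core/v1alpha1"
)

func main() {
    // One flavor asking for 8 GPUs per replica; cpu/mem are deliberately
    // omitted, as the updated comment recommends.
    flavor := coreapi.Flavor{
        Name: coreapi.FlavorName("a100"),
        Requests: v1.ResourceList{
            v1.ResourceName("nvidia.com/gpu"): resource.MustParse("8"),
        },
    }
    gpus := flavor.Requests[v1.ResourceName("nvidia.com/gpu")]
    fmt.Printf("flavor %s requests %s GPU(s) per replica\n", flavor.Name, gpus.String())
}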