
Commit 0c29bbf

Merge pull request #224 from kerthcet/feat/fungibility
Remove ElasticConfig from Service
2 parents 590d58d + 70b4a3b commit 0c29bbf

File tree

9 files changed (+44, -65 lines)


api/core/v1alpha1/model_types.go

Lines changed: 7 additions & 8 deletions

@@ -98,14 +98,13 @@ type FlavorName string
 type Flavor struct {
     // Name represents the flavor name, which will be used in model claim.
     Name FlavorName `json:"name"`
-    // Requests defines the required accelerators to serve the model, like nvidia.com/gpu: 8.
-    // When GPU number is greater than 8, like 32, then multi-host inference is enabled and
-    // 32/8=4 hosts will be grouped as an unit, each host will have a resource request as
-    // nvidia.com/gpu: 8. The may change in the future if the GPU number limit is broken.
-    // Not recommended to set the cpu and memory usage here.
-    // If using playground, you can define the cpu/mem usage at backendConfig.
-    // If using service, you can define the cpu/mem at the container resources.
-    // Note: if you define the same accelerator requests at playground/service as well,
+    // Requests defines the required accelerators to serve the model for each replica,
+    // like <nvidia.com/gpu: 8>. For multi-hosts cases, the requests here indicates
+    // the resource requirements for each replica. This may change in the future.
+    // Not recommended to set the cpu and memory usage here:
+    // - if using playground, you can define the cpu/mem usage at backendConfig.
+    // - if using inference service, you can define the cpu/mem at the container resources.
+    // However, if you define the same accelerator requests at playground/service as well,
     // the requests here will be covered.
     // +optional
     Requests v1.ResourceList `json:"requests,omitempty"`
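
For illustration, a Flavor that asks for 8 GPUs per replica could look like the sketch below. The flavor name "a100" and the GPU count are made up for the example; the types and the coreapi import alias are the ones appearing elsewhere in this commit.

    import (
        coreapi "github.com/inftyai/llmaz/api/core/v1alpha1"
        v1 "k8s.io/api/core/v1"
        "k8s.io/apimachinery/pkg/api/resource"
    )

    // Sketch only: one flavor requesting 8 GPUs per replica, mirroring the
    // <nvidia.com/gpu: 8> example in the updated comment. "a100" is purely
    // an illustrative flavor name.
    flavor := coreapi.Flavor{
        Name: "a100",
        Requests: v1.ResourceList{
            v1.ResourceName("nvidia.com/gpu"): resource.MustParse("8"),
        },
    }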

api/inference/v1alpha1/service_types.go

Lines changed: 0 additions & 5 deletions

@@ -35,11 +35,6 @@ type ServiceSpec struct {
     // LWS supports both single-host and multi-host scenarios, for single host
     // cases, only need to care about replicas, rolloutStrategy and workerTemplate.
     WorkloadTemplate lws.LeaderWorkerSetSpec `json:"workloadTemplate"`
-    // ElasticConfig defines the configuration for elastic usage,
-    // e.g. the max/min replicas. Default to 0 ~ Inf+.
-    // This requires to install the HPA first or will not work.
-    // +optional
-    ElasticConfig *ElasticConfig `json:"elasticConfig,omitempty"`
 }

 const (

api/inference/v1alpha1/zz_generated.deepcopy.go

Lines changed: 0 additions & 5 deletions
Some generated files are not rendered by default.

client-go/applyconfiguration/inference/v1alpha1/servicespec.go

Lines changed: 0 additions & 9 deletions
Some generated files are not rendered by default.

config/crd/bases/inference.llmaz.io_playgrounds.yaml

Lines changed: 21 additions & 0 deletions

@@ -215,6 +215,27 @@ spec:
                 from the default version.
               type: string
             type: object
+          elasticConfig:
+            description: |-
+              ElasticConfig defines the configuration for elastic usage,
+              e.g. the max/min replicas. Default to 0 ~ Inf+.
+              This requires to install the HPA first or will not work.
+            properties:
+              maxReplicas:
+                description: |-
+                  MaxReplicas indicates the maximum number of inference workloads based on the traffic.
+                  Default to nil means there's no limit for the instance number.
+                format: int32
+                type: integer
+              minReplicas:
+                default: 1
+                description: |-
+                  MinReplicas indicates the minimum number of inference workloads based on the traffic.
+                  Default to nil means we can scale down the instances to 1.
+                  If minReplicas set to 0, it requires to install serverless component at first.
+                format: int32
+                type: integer
+            type: object
           modelClaim:
             description: |-
               ModelClaim represents claiming for one model, it's a simplified use case
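
In Go, wiring these new fields onto a Playground's spec would presumably look like the sketch below. The inferenceapi import path and the surrounding playground variable are assumptions based on the test wrapper changes later in this commit; the replica numbers are illustrative.

    import (
        inferenceapi "github.com/inftyai/llmaz/api/inference/v1alpha1"
        "k8s.io/utils/ptr"
    )

    // Sketch: let the Playground scale between 1 and 3 replicas. Per the field
    // description above, an HPA must be installed for this to take effect.
    // `playground` is assumed to be an existing Playground object.
    playground.Spec.ElasticConfig = &inferenceapi.ElasticConfig{
        MinReplicas: ptr.To[int32](1),
        MaxReplicas: ptr.To[int32](3),
    }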

config/crd/bases/inference.llmaz.io_services.yaml

Lines changed: 0 additions & 21 deletions

@@ -44,27 +44,6 @@ spec:
             Service controller will maintain multi-flavor of workloads with
             different accelerators for cost or performance considerations.
           properties:
-            elasticConfig:
-              description: |-
-                ElasticConfig defines the configuration for elastic usage,
-                e.g. the max/min replicas. Default to 0 ~ Inf+.
-                This requires to install the HPA first or will not work.
-              properties:
-                maxReplicas:
-                  description: |-
-                    MaxReplicas indicates the maximum number of inference workloads based on the traffic.
-                    Default to nil means there's no limit for the instance number.
-                  format: int32
-                  type: integer
-                minReplicas:
-                  default: 1
-                  description: |-
-                    MinReplicas indicates the minimum number of inference workloads based on the traffic.
-                    Default to nil means we can scale down the instances to 1.
-                    If minReplicas set to 0, it requires to install serverless component at first.
-                  format: int32
-                  type: integer
-              type: object
             modelClaims:
               description: ModelClaims represents multiple claims for different
                 models.

config/crd/bases/llmaz.io_openmodels.yaml

Lines changed: 7 additions & 8 deletions

@@ -86,14 +86,13 @@ spec:
                 pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
                 x-kubernetes-int-or-string: true
               description: |-
-                Requests defines the required accelerators to serve the model, like nvidia.com/gpu: 8.
-                When GPU number is greater than 8, like 32, then multi-host inference is enabled and
-                32/8=4 hosts will be grouped as an unit, each host will have a resource request as
-                nvidia.com/gpu: 8. The may change in the future if the GPU number limit is broken.
-                Not recommended to set the cpu and memory usage here.
-                If using playground, you can define the cpu/mem usage at backendConfig.
-                If using service, you can define the cpu/mem at the container resources.
-                Note: if you define the same accelerator requests at playground/service as well,
+                Requests defines the required accelerators to serve the model for each replica,
+                like <nvidia.com/gpu: 8>. For multi-hosts cases, the requests here indicates
+                the resource requirements for each replica. This may change in the future.
+                Not recommended to set the cpu and memory usage here:
+                - if using playground, you can define the cpu/mem usage at backendConfig.
+                - if using inference service, you can define the cpu/mem at the container resources.
+                However, if you define the same accelerator requests at playground/service as well,
                 the requests here will be covered.
               type: object
             required:

test/util/wrapper/playground.go

Lines changed: 9 additions & 0 deletions

@@ -22,6 +22,7 @@ import (
     v1 "k8s.io/api/core/v1"
     "k8s.io/apimachinery/pkg/api/resource"
     metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+    "k8s.io/utils/ptr"
 )

 type PlaygroundWrapper struct {

@@ -154,3 +155,11 @@ func (w *PlaygroundWrapper) BackendRuntimeLimit(r, v string) *PlaygroundWrapper
     w.Spec.BackendRuntimeConfig.Resources.Limits[v1.ResourceName(r)] = resource.MustParse(v)
     return w
 }
+
+func (w *PlaygroundWrapper) ElasticConfig(maxReplicas, minReplicas int32) *PlaygroundWrapper {
+    w.Spec.ElasticConfig = &inferenceapi.ElasticConfig{
+        MaxReplicas: ptr.To[int32](maxReplicas),
+        MinReplicas: ptr.To[int32](minReplicas),
+    }
+    return w
+}
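
Note the argument order here: maxReplicas first, then minReplicas. Limiting a playground to a 1–3 replica range in a test would presumably read as below; how the *PlaygroundWrapper is built is outside this diff.

    // Sketch: w is an existing *PlaygroundWrapper constructed elsewhere in the test.
    w = w.ElasticConfig(3, 1) // maxReplicas=3, minReplicas=1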

test/util/wrapper/service.go

Lines changed: 0 additions & 9 deletions

@@ -19,7 +19,6 @@ package wrapper
 import (
     corev1 "k8s.io/api/core/v1"
     metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
-    "k8s.io/utils/ptr"
     lws "sigs.k8s.io/lws/api/leaderworkerset/v1"

     coreapi "github.com/inftyai/llmaz/api/core/v1alpha1"

@@ -65,14 +64,6 @@ func (w *ServiceWrapper) ModelClaims(modelNames []string, roles []string, flavor
     return w
 }

-func (w *ServiceWrapper) ElasticConfig(maxReplicas, minReplicas int32) *ServiceWrapper {
-    w.Spec.ElasticConfig = &inferenceapi.ElasticConfig{
-        MaxReplicas: ptr.To[int32](maxReplicas),
-        MinReplicas: ptr.To[int32](minReplicas),
-    }
-    return w
-}
-
 func (w *ServiceWrapper) WorkerTemplate() *ServiceWrapper {
     w.Spec.WorkloadTemplate.RolloutStrategy = lws.RolloutStrategy{
         Type: lws.RollingUpdateStrategyType,
