Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
61 changes: 27 additions & 34 deletions api/inference/v1alpha1/backendruntime_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,22 +19,10 @@ package v1alpha1
import (
autoscalingv2 "k8s.io/api/autoscaling/v2"
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// BackendRuntimeArg is the preset arguments for ease of use.
// Three preset names are provided: default, speculative-decoding, model-parallelism,
// do not change the name.
type BackendRuntimeArg struct {
// Name represents the identifier of the backendRuntime argument.
// +kubebuilder:default=default
// +optional
Name *string `json:"name,omitempty"`
// Flags represents all the preset configurations.
// A flag wrapped in {{ .CONFIG }} is a configuration placeholder waiting to be rendered.
Flags []string `json:"flags,omitempty"`
}

// HPATrigger represents the configuration of the HorizontalPodAutoscaler.
// Inspired by kubernetes.io/pkg/apis/autoscaling/types.go#HorizontalPodAutoscalerSpec.
// Note: the HPA component should be installed in advance.
Expand All @@ -55,17 +43,6 @@ type HPATrigger struct {
Behavior *autoscalingv2.HorizontalPodAutoscalerBehavior `json:"behavior,omitempty"`
}

// NamedScaleTrigger defines the rules to scale the workloads.
// Only one trigger could work at a time. The name is used to identify
// the trigger in backendRuntime.
type NamedScaleTrigger struct {
// Name represents the identifier of the scale trigger, e.g. some triggers are defined for
// latency sensitive workloads, some are defined for throughput sensitive workloads.
Name string `json:"name,omitempty"`
// HPA represents the trigger configuration of the HorizontalPodAutoscaler.
HPA *HPATrigger `json:"hpa,omitempty"`
}

// ScaleTrigger defines the rules to scale the workloads.
// Only one trigger could work at a time, mostly used in Playground.
type ScaleTrigger struct {
Expand All @@ -83,6 +60,30 @@ type MultiHostCommands struct {
Worker []string `json:"worker,omitempty"`
}

// RecommendedConfig represents the recommended configurations for the backendRuntime;
// users can choose one of them to apply.
type RecommendedConfig struct {
// Name represents the identifier of the config.
Name string `json:"name"`
// Args represents all the arguments for the command.
// An argument wrapped in {{ .CONFIG }} is a configuration placeholder waiting to be rendered.
// +optional
Args []string `json:"args,omitempty"`
// Resources represents the resource requirements for the backend, like cpu/mem.
// Accelerators like GPU should not be defined here, but at the model flavors,
// or the values here will be overwritten.
// +optional
Resources *ResourceRequirements `json:"resources,omitempty"`
// SharedMemorySize represents the size of /dev/shm required in the runtime of
// the inference workload.
// +optional
SharedMemorySize *resource.Quantity `json:"sharedMemorySize,omitempty"`
// ScaleTrigger defines the rules to scale the workloads.
// Only one trigger could work at a time.
// +optional
ScaleTrigger *ScaleTrigger `json:"scaleTrigger,omitempty"`
}

// BackendRuntimeSpec defines the desired state of BackendRuntime
type BackendRuntimeSpec struct {
// Commands represents the default commands for the backendRuntime.
Expand All @@ -98,16 +99,9 @@ type BackendRuntimeSpec struct {
// Version represents the default version of the backendRuntime.
// It will be appended to the image as a tag.
Version string `json:"version"`
// Args represents the preset arguments of the backendRuntime.
// They can be appended or overwritten by the Playground backendRuntimeConfig.
Args []BackendRuntimeArg `json:"args,omitempty"`
// Envs represents the environments set to the container.
// +optional
Envs []corev1.EnvVar `json:"envs,omitempty"`
// Resources represents the resource requirements for backendRuntime, like cpu/mem,
// accelerators like GPU should not be defined here, but at the model flavors,
// or the values here will be overwritten.
Resources ResourceRequirements `json:"resources"`
// Periodic probe of backend liveness.
// Backend will be restarted if the probe fails.
// Cannot be updated.
Expand All @@ -124,10 +118,9 @@ type BackendRuntimeSpec struct {
// when it might take a long time to load data or warm a cache, than during steady-state operation.
// +optional
StartupProbe *corev1.Probe `json:"startupProbe,omitempty"`
// ScaleTriggers represents a set of preset triggers to be used by Playground.
// If the Playground does not specify a scale trigger, the 0-index trigger will be used.
// RecommendedConfigs represents the recommended configurations for the backendRuntime.
// +optional
ScaleTriggers []NamedScaleTrigger `json:"scaleTriggers,omitempty"`
RecommendedConfigs []RecommendedConfig `json:"recommendedConfigs,omitempty"`
}

// BackendRuntimeStatus defines the observed state of BackendRuntime
Expand Down
56 changes: 20 additions & 36 deletions api/inference/v1alpha1/config_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,29 +28,43 @@ const (
)

type BackendRuntimeConfig struct {
// Name represents the inference backend under the hood, e.g. vLLM.
// BackendName represents the inference backend under the hood, e.g. vLLM.
// +kubebuilder:default=vllm
// +optional
Name *BackendName `json:"name,omitempty"`
BackendName *BackendName `json:"backendName,omitempty"`
// Version represents the backend version if you want a different one
// from the default version.
// +optional
Version *string `json:"version,omitempty"`
// Envs represents the environments set to the container.
// +optional
Envs []corev1.EnvVar `json:"envs,omitempty"`

// ConfigName represents the recommended configuration name for the backend.
// It will be inferred from the models in the runtime if not specified, e.g. default,
// speculative-decoding or model-parallelism.
ConfigName *string `json:"configName,omitempty"`
// Args represents all the arguments for the command.
// An argument wrapped in {{ .CONFIG }} is a configuration placeholder waiting to be rendered.
// +optional
// Args defined here will "append" the args in the recommendedConfig.
// +optional
Args []string `json:"args,omitempty"`
// Resources represents the resource requirements for backend, like cpu/mem,
// accelerators like GPU should not be defined here, but at the model flavors,
// or the values here will be overwritten.
// Resources defined here will "overwrite" the resources in the recommendedConfig.
// +optional
Resources *ResourceRequirements `json:"resources,omitempty"`
// SharedMemorySize represents the size of /dev/shm required in the runtime of
// inference workload.
// SharedMemorySize defined here will "overwrite" the sharedMemorySize in the recommendedConfig.
// +optional
SharedMemorySize *resource.Quantity `json:"sharedMemorySize,omitempty"`
// Args represents the specified arguments of the backendRuntime,
// which will be appended to the backendRuntime.spec.Args.
Args *BackendRuntimeArg `json:"args,omitempty"`
// ScaleTrigger defines the rules to scale the workloads.
// Only one trigger could work at a time, mostly used in Playground.
// ScaleTrigger defined here will "overwrite" the scaleTrigger in the recommendedConfig.
// +optional
ScaleTrigger *ScaleTrigger `json:"scaleTrigger,omitempty"`
}

// TODO: Do not support DRA yet, we can support that once needed.
Expand All @@ -66,33 +80,3 @@ type ResourceRequirements struct {
// +optional
Requests corev1.ResourceList `json:"requests,omitempty"`
}

// ScaleTriggerRef refers to a scaleTrigger configured in the backendRuntime.
type ScaleTriggerRef struct {
// Name represents the scale trigger name defined in the backendRuntime.scaleTriggers.
Name string `json:"name"`
}

// ElasticConfig defines the configuration for elastic scaling of inference workloads.
type ElasticConfig struct {
// MinReplicas indicates the minimum number of inference workloads based on the traffic.
// Defaults to 1.
// MinReplicas couldn't be 0 now, will support serverless in the future.
// +kubebuilder:default=1
// +optional
MinReplicas *int32 `json:"minReplicas,omitempty"`
// MaxReplicas indicates the maximum number of inference workloads based on the traffic.
// Defaults to nil, which means there's no limit on the instance number.
// +optional
MaxReplicas *int32 `json:"maxReplicas,omitempty"`
// ScaleTriggerRef refers to the configured scaleTrigger in the backendRuntime
// with a tuned target value.
// ScaleTriggerRef and ScaleTrigger can't be set at the same time.
// +optional
ScaleTriggerRef *ScaleTriggerRef `json:"scaleTriggerRef,omitempty"`
// ScaleTrigger defines the trigger to scale the workloads.
// If not defined, the trigger configured in backendRuntime will be used;
// otherwise, the trigger defined here will overwrite the defaulted one.
// ScaleTriggerRef and ScaleTrigger can't be set at the same time.
// +optional
ScaleTrigger *ScaleTrigger `json:"scaleTrigger,omitempty"`
}
15 changes: 13 additions & 2 deletions api/inference/v1alpha1/playground_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -44,11 +44,22 @@ type PlaygroundSpec struct {
BackendRuntimeConfig *BackendRuntimeConfig `json:"backendRuntimeConfig,omitempty"`
// ElasticConfig defines the configuration for elastic usage,
// e.g. the max/min replicas.
// Note: this requires the HPA to be installed first, or an error will be reported.
// +optional
ElasticConfig *ElasticConfig `json:"elasticConfig,omitempty"`
}

// ElasticConfig defines the configuration for elastic scaling of inference
// workloads, e.g. the max/min replicas.
type ElasticConfig struct {
// MinReplicas indicates the minimum number of inference workloads based on the traffic.
// Defaults to 1.
// MinReplicas couldn't be 0 now, will support serverless in the future.
// +kubebuilder:default=1
// +optional
MinReplicas *int32 `json:"minReplicas,omitempty"`
// MaxReplicas indicates the maximum number of inference workloads based on the traffic.
// Defaults to nil, which means there's no limit on the instance number.
// +optional
MaxReplicas *int32 `json:"maxReplicas,omitempty"`
}

const (
// PlaygroundProgressing means the Playground is progressing now, such as waiting for the
// inference service creation, rolling update or scaling up and down.
Expand Down
Loading
Loading