Skip to content

Commit f02361f

Browse files
Merge pull request #139 from kerthcet/feat/backendRuntime
[2/N] Add backendRuntime implementation
2 parents 49009ae + 4b145f4 commit f02361f

Some content is hidden

Large commits have some content hidden by default. Use the search box below for content that may be hidden.

42 files changed

+675
-863
lines changed

api/core/v1alpha1/model_types.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -181,7 +181,7 @@ type ModelStatus struct {
181181
//+genclient
182182
//+kubebuilder:object:root=true
183183
//+kubebuilder:subresource:status
184-
//+kubebuilder:resource:scope=Cluster
184+
//+kubebuilder:resource:shortName=om,scope=Cluster
185185

186186
// OpenModel is the Schema for the open models API
187187
type OpenModel struct {

api/inference/v1alpha1/backendruntime_types.go

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -24,8 +24,8 @@ import (
2424
type InferenceMode string
2525

2626
const (
27-
DefaultInferenceMode InferenceMode = "default"
28-
SpeculativeDecodingInferenceMode InferenceMode = "speculative-decoding"
27+
DefaultInferenceMode InferenceMode = "Default"
28+
SpeculativeDecodingInferenceMode InferenceMode = "SpeculativeDecoding"
2929
)
3030

3131
type BackendRuntimeArg struct {
@@ -47,6 +47,7 @@ type BackendRuntimeSpec struct {
4747
// They can be appended or overwritten by the Playground args.
4848
// The key is the inference option, like default one or advanced
4949
// speculativeDecoding, the values are the corresponding args.
50+
// A flag wrapped with {{ .XXX }} is a template placeholder waiting to be rendered.
5051
Args []BackendRuntimeArg `json:"args,omitempty"`
5152
// Envs represents the environments set to the container.
5253
// +optional
@@ -65,7 +66,7 @@ type BackendRuntimeStatus struct {
6566

6667
//+kubebuilder:object:root=true
6768
//+kubebuilder:subresource:status
68-
//+kubebuilder:resource:scope=Cluster
69+
//+kubebuilder:resource:shortName=br,scope=Cluster
6970

7071
// BackendRuntime is the Schema for the backendRuntime API
7172
type BackendRuntime struct {

api/inference/v1alpha1/config_types.go

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,6 @@ const (
3030

3131
type BackendRuntimeConfig struct {
3232
// Name represents the inference backend under the hood, e.g. vLLM.
33-
// +kubebuilder:validation:Enum={vllm,sglang,llamacpp}
3433
// +kubebuilder:default=vllm
3534
// +optional
3635
Name *BackendName `json:"name,omitempty"`

api/inference/v1alpha1/playground_types.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,7 @@ type PlaygroundStatus struct {
6161
//+genclient
6262
//+kubebuilder:object:root=true
6363
//+kubebuilder:subresource:status
64+
//+kubebuilder:resource:shortName={pl}
6465

6566
// Playground is the Schema for the playgrounds API
6667
type Playground struct {

config/crd/bases/inference.llmaz.io_backendruntimes.yaml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,8 @@ spec:
1111
kind: BackendRuntime
1212
listKind: BackendRuntimeList
1313
plural: backendruntimes
14+
shortNames:
15+
- br
1416
singular: backendruntime
1517
scope: Cluster
1618
versions:
@@ -45,6 +47,7 @@ spec:
4547
They can be appended or overwritten by the Playground args.
4648
The key is the inference option, like default one or advanced
4749
speculativeDecoding, the values are the corresponding args.
50+
A flag wrapped with {{ .XXX }} is a template placeholder waiting to be rendered.
4851
items:
4952
properties:
5053
flags:

config/crd/bases/inference.llmaz.io_playgrounds.yaml

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,8 @@ spec:
1111
kind: Playground
1212
listKind: PlaygroundList
1313
plural: playgrounds
14+
shortNames:
15+
- pl
1416
singular: playground
1517
scope: Namespaced
1618
versions:
@@ -179,10 +181,6 @@ spec:
179181
default: vllm
180182
description: Name represents the inference backend under the hood,
181183
e.g. vLLM.
182-
enum:
183-
- vllm
184-
- sglang
185-
- llamacpp
186184
type: string
187185
resources:
188186
description: |-

config/crd/bases/llmaz.io_openmodels.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,8 @@ spec:
1111
kind: OpenModel
1212
listKind: OpenModelList
1313
plural: openmodels
14+
shortNames:
15+
- om
1416
singular: openmodel
1517
scope: Cluster
1618
versions:

config/rbac/role.yaml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ rules:
3434
- apiGroups:
3535
- inference.llmaz.io
3636
resources:
37-
- backends
37+
- backendruntimes
3838
verbs:
3939
- create
4040
- delete
@@ -46,13 +46,13 @@ rules:
4646
- apiGroups:
4747
- inference.llmaz.io
4848
resources:
49-
- backends/finalizers
49+
- backendruntimes/finalizers
5050
verbs:
5151
- update
5252
- apiGroups:
5353
- inference.llmaz.io
5454
resources:
55-
- backends/status
55+
- backendruntimes/status
5656
verbs:
5757
- get
5858
- patch

docs/assets/arch.png

6.32 KB
Loading

docs/examples/llamacpp/playground.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ spec:
66
replicas: 1
77
modelClaim:
88
modelName: qwen2-0--5b-gguf
9-
backendConfig:
9+
backendRuntimeConfig:
1010
name: llamacpp
1111
args:
1212
- -fa # use flash attention

0 commit comments

Comments
 (0)