Commit 89ef2e6

feat: support runai streamer for vllm (#423)
* feat: support runai streamer for vllm
* refactor
* refactor
* fix
* refactor
* add unit tests
* fix
* fix
* fix
* fix
* gcs
* refactor v2
* e2e
* add SkipModelLoader help function
* merge ValidateSkipModelLoaderService into ValidateService
* fix
* fix
* fix e2e tests
1 parent 2a55f84 commit 89ef2e6

16 files changed: +675 additions, -107 deletions

api/inference/v1alpha1/playground_types.go

Lines changed: 3 additions & 0 deletions
@@ -73,6 +73,9 @@ const (
   PlaygroundProgressing = "Progressing"
   // PlaygroundAvailable indicates the corresponding inference service is available now.
   PlaygroundAvailable string = "Available"
+  // SkipModelLoaderAnnoKey indicates whether to skip the model loader,
+  // enabling the inference engine to manage model loading directly.
+  SkipModelLoaderAnnoKey = "llmaz.io/skip-model-loader"
 )

 // PlaygroundStatus defines the observed state of Playground

docs/examples/README.md

Lines changed: 4 additions & 0 deletions
@@ -68,6 +68,10 @@ llama.cpp supports speculative decoding to significantly improve inference perfo

 [Speculative Decoding](https://arxiv.org/abs/2211.17192) can improve inference performance efficiently, see [example](./speculative-decoding/vllm/) here.

+### Loading models with the Run:ai Model Streamer in vLLM
+
+[Run:ai Model Streamer](https://github.com/run-ai/runai-model-streamer/blob/master/docs/README.md) is a library for reading tensors concurrently and streaming them to GPU memory. vLLM supports loading weights in Safetensors format using the Run:ai Model Streamer. See [example](./runai-streamer/) here.
+
 ### Multi-Host Inference

 Model size is growing bigger and bigger, Llama 3.1 405B FP16 LLM requires more than 750 GB GPU for weights only, leaving kv cache unconsidered, even with 8 x H100 Nvidia GPUs, 80 GB size of HBM each, can not fit in a single host, requires a multi-host deployment, see [example](./multi-nodes/) here.
Lines changed: 44 additions & 0 deletions
@@ -0,0 +1,44 @@
+# This example demonstrates how to use the Run:ai Model Streamer to load models from the local file system.
+# The model-loader initContainer first downloads the model from Hugging Face.
+# By using `--load-format runai_streamer`, vLLM leverages the Run:ai Model Streamer to stream models from the local file system.
+# While this approach may be slightly slower than streaming directly from S3 (due to the initial download to local disk),
+# it still offers faster model loading compared to not using the Streamer,
+# as it utilizes multiple threads to concurrently read tensor data from files into a dedicated CPU buffer,
+# and then transfers the tensors to GPU memory.
+apiVersion: llmaz.io/v1alpha1
+kind: OpenModel
+metadata:
+  name: deepseek-r1-distill-qwen-1-5b
+spec:
+  familyName: deepseek
+  source:
+    modelHub:
+      modelID: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
+  inferenceConfig:
+    flavors:
+      - name: t4 # GPU type
+        limits:
+          nvidia.com/gpu: 1
+---
+apiVersion: inference.llmaz.io/v1alpha1
+kind: Playground
+metadata:
+  name: deepseek-r1-distill-qwen-1-5b
+spec:
+  replicas: 1
+  modelClaim:
+    modelName: deepseek-r1-distill-qwen-1-5b
+  backendRuntimeConfig:
+    backendName: vllm # currently, only vllm supports runai streamer
+    args:
+      - --load-format
+      - runai_streamer
+    resources:
+      limits:
+        cpu: "4"
+        memory: 16Gi
+        nvidia.com/gpu: "1"
+      requests:
+        cpu: "4"
+        memory: 16Gi
+        nvidia.com/gpu: "1"
Lines changed: 54 additions & 0 deletions
@@ -0,0 +1,54 @@
+# This example demonstrates how to use the Run:ai Model Streamer to load models directly from S3.
+# Adding the annotation `llmaz.io/skip-model-loader: "true"` skips the model-loader initContainer,
+# allowing the inference engine to load models directly from remote storage (e.g., S3).
+# By using `--load-format runai_streamer`, vLLM leverages the Run:ai Model Streamer to stream models from S3.
+# vLLM loads models into the CPU buffer and then into GPU memory, without needing to download them to local disk first.
+# This can significantly improve model loading speed and reduce disk usage.
+apiVersion: llmaz.io/v1alpha1
+kind: OpenModel
+metadata:
+  name: deepseek-r1-distill-qwen-1-5b
+spec:
+  familyName: deepseek
+  source:
+    # Note: You need to replace <YOUR_S3_BUCKET> with your actual S3 bucket name.
+    # If the S3 bucket needs AWS credentials for authentication,
+    # run `kubectl create secret generic aws-access-secret --from-literal=AWS_ACCESS_KEY_ID=<YOUR_ACCESS_KEY_ID> --from-literal=AWS_SECRET_ACCESS_KEY=<YOUR_SECRET_ACCESS_KEY>` beforehand.
+    uri: s3://<YOUR_S3_BUCKET>/DeepSeek-R1-Distill-Qwen-1.5B
+  inferenceConfig:
+    flavors:
+      - name: t4 # GPU type
+        limits:
+          nvidia.com/gpu: 1
+---
+apiVersion: inference.llmaz.io/v1alpha1
+kind: Playground
+metadata:
+  name: deepseek-r1-distill-qwen-1-5b
+  annotations:
+    llmaz.io/skip-model-loader: "true"
+spec:
+  replicas: 1
+  modelClaim:
+    modelName: deepseek-r1-distill-qwen-1-5b
+  backendRuntimeConfig:
+    backendName: vllm # currently, only vllm supports runai streamer
+    args:
+      - --load-format
+      - runai_streamer
+    envs:
+      # The default value is 1 second. Increase it to 10 seconds to avoid timeouts under slow network conditions.
+      - name: RUNAI_STREAMER_S3_REQUEST_TIMEOUT_MS
+        value: "10000"
+      # Controls the level of concurrency and the number of OS threads reading tensors from the file into the CPU buffer; the default value is 16.
+      #- name: RUNAI_STREAMER_CONCURRENCY
+      #  value: "32"
+    resources:
+      limits:
+        cpu: "4"
+        memory: 16Gi
+        nvidia.com/gpu: "1"
+      requests:
+        cpu: "4"
+        memory: 16Gi
+        nvidia.com/gpu: "1"
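
For completeness, the `kubectl create secret` command referenced in the comments above can also be done programmatically. The following is a minimal client-go sketch and is not part of this commit; the `default` namespace and kubeconfig location are assumptions, while the secret name and keys come from the example's comment.

```go
package main

import (
    "context"

    corev1 "k8s.io/api/core/v1"
    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    "k8s.io/client-go/kubernetes"
    "k8s.io/client-go/tools/clientcmd"
)

func main() {
    // Load the local kubeconfig (assumed to live at the default location).
    cfg, err := clientcmd.BuildConfigFromFlags("", clientcmd.RecommendedHomeFile)
    if err != nil {
        panic(err)
    }
    clientset, err := kubernetes.NewForConfig(cfg)
    if err != nil {
        panic(err)
    }

    // Equivalent of `kubectl create secret generic aws-access-secret ...` from the comment above.
    secret := &corev1.Secret{
        ObjectMeta: metav1.ObjectMeta{Name: "aws-access-secret", Namespace: "default"},
        StringData: map[string]string{
            "AWS_ACCESS_KEY_ID":     "<YOUR_ACCESS_KEY_ID>",
            "AWS_SECRET_ACCESS_KEY": "<YOUR_SECRET_ACCESS_KEY>",
        },
    }
    if _, err := clientset.CoreV1().Secrets("default").Create(context.Background(), secret, metav1.CreateOptions{}); err != nil {
        panic(err)
    }
}
```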

pkg/controller/inference/playground_controller.go

Lines changed: 9 additions & 0 deletions
@@ -201,6 +201,15 @@ func buildServiceApplyConfiguration(models []*coreapi.OpenModel, playground *inf
   // Build metadata
   serviceApplyConfiguration := inferenceclientgo.Service(playground.Name, playground.Namespace)

+  if annotations := playground.GetAnnotations(); annotations != nil {
+    // Propagate llmaz.io/skip-model-loader annotation to Inference Service.
+    if value, exists := annotations[inferenceapi.SkipModelLoaderAnnoKey]; exists {
+      serviceApplyConfiguration.WithAnnotations(map[string]string{
+        inferenceapi.SkipModelLoaderAnnoKey: value,
+      })
+    }
+  }
+
   // Build spec.
   spec := inferenceclientgo.ServiceSpec()
pkg/controller/inference/service_controller.go

Lines changed: 18 additions & 2 deletions
@@ -196,10 +196,26 @@ func injectModelProperties(template *applyconfigurationv1.LeaderWorkerTemplateAp

   for i, model := range models {
     source := modelSource.NewModelSourceProvider(model)
+    // Skip model-loader initContainer if llmaz.io/skip-model-loader annotation is set.
+    if !helper.SkipModelLoader(service) {
+      if isMultiNodesInference {
+        source.InjectModelLoader(template.LeaderTemplate, i)
+      }
+      source.InjectModelLoader(template.WorkerTemplate, i)
+    } else {
+      if isMultiNodesInference {
+        source.InjectModelEnvVars(template.LeaderTemplate)
+      }
+      source.InjectModelEnvVars(template.WorkerTemplate)
+    }
+  }
+
+  // If model-loader initContainer is injected, we should mount the model-volume to the model-runner container.
+  if !helper.SkipModelLoader(service) {
     if isMultiNodesInference {
-      source.InjectModelLoader(template.LeaderTemplate, i)
+      modelSource.InjectModelVolume(template.LeaderTemplate)
     }
-    source.InjectModelLoader(template.WorkerTemplate, i)
+    modelSource.InjectModelVolume(template.WorkerTemplate)
   }

   // We only consider the main model's requirements for now.

pkg/controller_helper/backendruntime/backendruntime.go

Lines changed: 2 additions & 2 deletions
@@ -65,14 +65,14 @@ func (p *BackendRuntimeParser) Args() ([]string, error) {

   source := modelSource.NewModelSourceProvider(mainModel)
   modelInfo := map[string]string{
-    "ModelPath": source.ModelPath(),
+    "ModelPath": source.ModelPath(helper.SkipModelLoader(p.playground)),
     "ModelName": source.ModelName(),
   }

   // TODO: This is not that reliable because two models doesn't always means speculative-decoding.
   // Revisit this later.
   if len(p.models) > 1 {
-    modelInfo["DraftModelPath"] = modelSource.NewModelSourceProvider(p.models[1]).ModelPath()
+    modelInfo["DraftModelPath"] = modelSource.NewModelSourceProvider(p.models[1]).ModelPath(helper.SkipModelLoader(p.playground))
   }

   for _, recommend := range p.backendRuntime.Spec.RecommendedConfigs {
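
To make the new boolean argument concrete: based on the `ModelHubProvider.ModelPath` change in `pkg/controller_helper/modelsource/modelhub.go` further down, a skipped loader yields the raw hub model ID, while the default mode points at the model-loader's download location. The standalone sketch below illustrates that contract only; the constant value and the fileName-less fallback are simplifications, not the project's actual code.

```go
package main

import "fmt"

// containerModelPath mirrors the /workspace/models/ prefix shown in the modelhub.go doc comment below.
const containerModelPath = "/workspace/models/"

// modelPath is an illustrative stand-in for ModelHubProvider.ModelPath(skipModelLoader bool):
// when the loader is skipped, the raw model ID is handed to the engine (vLLM streams or
// downloads it itself); otherwise the path points at where the model-loader initContainer
// placed the weights. The case without an explicit fileName is simplified here.
func modelPath(modelID string, fileName *string, skipModelLoader bool) string {
    if skipModelLoader {
        return modelID // e.g. "facebook/opt-125m"
    }
    if fileName != nil {
        return containerModelPath + *fileName
    }
    return containerModelPath + modelID // simplification, not the project's exact layout
}

func main() {
    gguf := "qwen2-0_5b-instruct-q5_k_m.gguf"
    fmt.Println(modelPath("Qwen/Qwen2-0.5B-Instruct-GGUF", &gguf, false)) // /workspace/models/qwen2-0_5b-instruct-q5_k_m.gguf
    fmt.Println(modelPath("deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B", nil, true))
}
```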

pkg/controller_helper/helper.go

Lines changed: 8 additions & 0 deletions
@@ -21,6 +21,7 @@ import (

   coreapi "github.com/inftyai/llmaz/api/core/v1alpha1"
   inferenceapi "github.com/inftyai/llmaz/api/inference/v1alpha1"
+  metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
   "k8s.io/apimachinery/pkg/types"
   "sigs.k8s.io/controller-runtime/pkg/client"
 )
@@ -122,3 +123,10 @@ func FirstAssignedFlavor(model *coreapi.OpenModel, playground *inferenceapi.Play

   return nil
 }
+
+func SkipModelLoader(obj metav1.Object) bool {
+  if annotations := obj.GetAnnotations(); annotations != nil {
+    return annotations[inferenceapi.SkipModelLoaderAnnoKey] == "true"
+  }
+  return false
+}
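
A quick way to see the helper's contract: only the exact annotation value `"true"` enables the skip. The self-contained sketch below mirrors `SkipModelLoader` against plain apimachinery objects and is not part of the commit.

```go
package main

import (
    "fmt"

    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// skipModelLoaderAnnoKey and skipModelLoader mirror inferenceapi.SkipModelLoaderAnnoKey
// and helper.SkipModelLoader from the diffs above.
const skipModelLoaderAnnoKey = "llmaz.io/skip-model-loader"

func skipModelLoader(obj metav1.Object) bool {
    if annotations := obj.GetAnnotations(); annotations != nil {
        return annotations[skipModelLoaderAnnoKey] == "true"
    }
    return false
}

func main() {
    enabled := &metav1.ObjectMeta{Annotations: map[string]string{skipModelLoaderAnnoKey: "true"}}
    disabled := &metav1.ObjectMeta{Annotations: map[string]string{skipModelLoaderAnnoKey: "false"}}
    unset := &metav1.ObjectMeta{}

    fmt.Println(skipModelLoader(enabled))  // true
    fmt.Println(skipModelLoader(disabled)) // false: any value other than "true" is ignored
    fmt.Println(skipModelLoader(unset))    // false
}
```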

pkg/controller_helper/modelsource/modelhub.go

Lines changed: 67 additions & 33 deletions
@@ -50,7 +50,13 @@ func (p *ModelHubProvider) ModelName() string {
 // - modelID: Qwen/Qwen2-0.5B-Instruct-GGUF
 //   fileName: qwen2-0_5b-instruct-q5_k_m.gguf
 //   modelPath: /workspace/models/qwen2-0_5b-instruct-q5_k_m.gguf
-func (p *ModelHubProvider) ModelPath() string {
+func (p *ModelHubProvider) ModelPath(skipModelLoader bool) string {
+  // Skip the model loader to allow the inference engine to handle loading models directly from model hub (e.g., Hugging Face, ModelScope).
+  // In this case, the model ID should be returned (e.g., facebook/opt-125m).
+  if skipModelLoader {
+    return p.modelID
+  }
+
   if p.fileName != nil {
     return CONTAINER_MODEL_PATH + *p.fileName
   }
@@ -108,59 +114,87 @@ func (p *ModelHubProvider) InjectModelLoader(template *corev1.PodTemplateSpec, i
   // Both HUGGING_FACE_HUB_TOKEN and HF_TOKEN works.
   initContainer.Env = append(initContainer.Env,
     corev1.EnvVar{
-      Name: "HUGGING_FACE_HUB_TOKEN",
+      Name: HUGGING_FACE_HUB_TOKEN,
       ValueFrom: &corev1.EnvVarSource{
         SecretKeyRef: &corev1.SecretKeySelector{
           LocalObjectReference: corev1.LocalObjectReference{
             Name: MODELHUB_SECRET_NAME, // if secret not exists, the env is empty.
           },
-          Key:      HUGGINGFACE_TOKEN_KEY,
+          Key:      HUGGING_FACE_TOKEN_KEY,
           Optional: ptr.To[bool](true),
         },
       },
-    }, corev1.EnvVar{
-      Name: "HF_TOKEN",
+    })
+
+  initContainer.Env = append(initContainer.Env,
+    corev1.EnvVar{
+      Name: HUGGING_FACE_TOKEN_KEY,
       ValueFrom: &corev1.EnvVarSource{
         SecretKeyRef: &corev1.SecretKeySelector{
           LocalObjectReference: corev1.LocalObjectReference{
             Name: MODELHUB_SECRET_NAME,
           },
-          Key:      HUGGINGFACE_TOKEN_KEY,
+          Key:      HUGGING_FACE_TOKEN_KEY,
           Optional: ptr.To[bool](true),
         },
       },
-    },
-  )
-  template.Spec.InitContainers = append(template.Spec.InitContainers, *initContainer)
+    })

-  // Return once not the main model, because all the below has already been injected.
-  if index != 0 {
-    return
-  }
+  template.Spec.InitContainers = append(template.Spec.InitContainers, *initContainer)
+}

-  // Handle container.
+func spreadEnvToInitContainer(containerEnv []corev1.EnvVar, initContainer *corev1.Container) {
+  initContainer.Env = append(initContainer.Env, containerEnv...)
+}

+func (p *ModelHubProvider) InjectModelEnvVars(template *corev1.PodTemplateSpec) {
   for i := range template.Spec.Containers {
-    // We only consider this container.
     if template.Spec.Containers[i].Name == MODEL_RUNNER_CONTAINER_NAME {
-      template.Spec.Containers[i].VolumeMounts = append(template.Spec.Containers[i].VolumeMounts, corev1.VolumeMount{
-        Name:      MODEL_VOLUME_NAME,
-        MountPath: CONTAINER_MODEL_PATH,
-        ReadOnly:  true,
-      })
+      // Check if HuggingFace token environment variables already exist
+      hfHubTokenExists := false
+      hfTokenExists := false
+      for _, env := range template.Spec.Containers[i].Env {
+        if env.Name == HUGGING_FACE_HUB_TOKEN {
+          hfHubTokenExists = true
+        }
+        if env.Name == HUGGING_FACE_TOKEN_KEY {
+          hfTokenExists = true
+        }
+      }
+
+      // Add HUGGING_FACE_HUB_TOKEN if it doesn't exist
+      if !hfHubTokenExists {
+        template.Spec.Containers[i].Env = append(template.Spec.Containers[i].Env,
+          corev1.EnvVar{
+            Name: HUGGING_FACE_HUB_TOKEN,
+            ValueFrom: &corev1.EnvVarSource{
+              SecretKeyRef: &corev1.SecretKeySelector{
+                LocalObjectReference: corev1.LocalObjectReference{
+                  Name: MODELHUB_SECRET_NAME, // if secret not exists, the env is empty.
+                },
+                Key:      HUGGING_FACE_TOKEN_KEY,
+                Optional: ptr.To[bool](true),
+              },
+            },
+          })
+      }
+
+      // Add HF_TOKEN if it doesn't exist
+      if !hfTokenExists {
+        template.Spec.Containers[i].Env = append(template.Spec.Containers[i].Env,
+          corev1.EnvVar{
+            Name: HUGGING_FACE_TOKEN_KEY,
+            ValueFrom: &corev1.EnvVarSource{
+              SecretKeyRef: &corev1.SecretKeySelector{
+                LocalObjectReference: corev1.LocalObjectReference{
+                  Name: MODELHUB_SECRET_NAME,
+                },
+                Key:      HUGGING_FACE_TOKEN_KEY,
+                Optional: ptr.To[bool](true),
+              },
+            },
+          })
+      }
     }
   }
-
-  // Handle spec.
-
-  template.Spec.Volumes = append(template.Spec.Volumes, corev1.Volume{
-    Name: MODEL_VOLUME_NAME,
-    VolumeSource: corev1.VolumeSource{
-      EmptyDir: &corev1.EmptyDirVolumeSource{},
-    },
-  })
-}
-
-func spreadEnvToInitContainer(containerEnv []corev1.EnvVar, initContainer *corev1.Container) {
-  initContainer.Env = append(initContainer.Env, containerEnv...)
 }