
Commit f1fa51f

Merge pull request #402 from cr7258/speculative-llamacpp
feat: support speculative decoding for llamacpp
2 parents 4f16a96 + fca2fbe commit f1fa51f

File tree: 7 files changed, +59 −53 lines


api/inference/v1alpha1/config_types.go

Lines changed: 0 additions & 1 deletion
@@ -42,7 +42,6 @@ type BackendRuntimeConfig struct {
     // ConfigName represents the recommended configuration name for the backend,
     // It will be inferred from the models in the runtime if not specified, e.g. default,
     // speculative-decoding.
-    // +kubebuilder:default=default
     ConfigName *string `json:"configName,omitempty"`
     // Args defined here will "append" the args defined in the recommendedConfig,
     // either explicitly configured in configName or inferred in the runtime.
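With the `+kubebuilder:default=default` marker removed, `configName` no longer defaults to `default` at admission time; when it is left unset, the recommended configuration is inferred from the models claimed at runtime, which is what lets a two-model claim resolve to `speculative-decoding`. As a rough illustration, a Playground relying on that inference could look like the sketch below; the `modelClaims` layout is an assumption based on the project's docs examples and the e2e wrapper call `ModelClaims([...], []string{"main", "draft"})`, not something shown in this diff.

# Hedged sketch: configName omitted so the speculative-decoding config is inferred.
# The modelClaims field layout is assumed, not taken from this commit.
apiVersion: inference.llmaz.io/v1alpha1
kind: Playground
metadata:
  name: llamacpp-speculator
spec:
  replicas: 1
  modelClaims:
    models:
    - name: llama2-7b-q8-gguf    # main model
      role: main
    - name: llama2-7b-q2-k-gguf  # draft model
      role: draft
  backendRuntime:
    backendName: llamacpp
    # configName intentionally left unset; inferred as speculative-decoding.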

chart/templates/backends/llamacpp.yaml

Lines changed: 14 additions & 0 deletions
@@ -49,6 +49,20 @@ spec:
       limits:
         cpu: 2
         memory: 4Gi
+  - name: speculative-decoding
+    args:
+    - -m
+    - "{{`{{ .ModelPath }}`}}"
+    - -md
+    - "{{`{{ .DraftModelPath }}`}}"
+    - --host
+    - "0.0.0.0"
+    - --port
+    - "8080"
+    - --draft-max
+    - "16"
+    - --draft-min
+    - "5"
   startupProbe:
     periodSeconds: 10
     failureThreshold: 30
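When this recommended config is rendered, the Go-template placeholders are replaced with the actual model mount paths, and any args set on the Playground itself are appended afterwards, following the "append" semantics documented on BackendRuntimeConfig.Args. A hedged sketch of the resulting llama-server arguments follows; the /workspace/models paths are placeholders rather than the loader's real mount points.

# Hedged sketch of the rendered speculative-decoding args; paths are hypothetical.
args:
- -m
- /workspace/models/llama-2-7b.Q8_0.gguf   # substituted for {{ .ModelPath }}
- -md
- /workspace/models/llama-2-7b.Q2_K.gguf   # substituted for {{ .DraftModelPath }}
- --host
- "0.0.0.0"
- --port
- "8080"
- --draft-max
- "16"
- --draft-min
- "5"
- -fa                                      # appended from the Playground's own args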

config/crd/bases/inference.llmaz.io_playgrounds.yaml

Lines changed: 0 additions & 1 deletion
@@ -59,7 +59,6 @@ spec:
                  the hood, e.g. vLLM.
                type: string
              configName:
-               default: default
                description: |-
                  ConfigName represents the recommended configuration name for the backend,
                  It will be inferred from the models in the runtime if not specified, e.g. default,

docs/examples/README.md

Lines changed: 5 additions & 0 deletions
@@ -12,6 +12,7 @@ We provide a set of examples to help you serve large language models, by default
 - [Deploy models via TensorRT-LLM](#deploy-models-via-tensorrt-llm)
 - [Deploy models via text-generation-inference](#deploy-models-via-text-generation-inference)
 - [Deploy models via ollama](#deploy-models-via-ollama)
+- [Speculative Decoding with llama.cpp](#speculative-decoding-with-llamacpp)
 - [Speculative Decoding with vLLM](#speculative-decoding-with-vllm)
 - [Multi-Host Inference](#multi-host-inference)
 - [Deploy Host Models](#deploy-host-models)
@@ -59,6 +60,10 @@ By default, we use [vLLM](https://github.com/vllm-project/vllm) as the inference
 
 [ollama](https://github.com/ollama/ollama) based on llama.cpp, aims for local deploy. see [example](./ollama/) here.
 
+### Speculative Decoding with llama.cpp
+
+llama.cpp supports speculative decoding to significantly improve inference performance, see [example](./speculative-decoding/llamacpp/) here.
+
 ### Speculative Decoding with vLLM
 
 [Speculative Decoding](https://arxiv.org/abs/2211.17192) can improve inference performance efficiently, see [example](./speculative-decoding/vllm/) here.

docs/examples/speculative-decoding/llamacpp/playground.yaml

Lines changed: 1 addition & 8 deletions
@@ -1,5 +1,5 @@
 # This is just an toy example, because it doesn't make any sense
-# in real world, drafting tokens for the model with similar size.
+# in real world, drafting tokens for the model with smaller size.
 apiVersion: llmaz.io/v1alpha1
 kind: OpenModel
 metadata:
@@ -38,10 +38,3 @@ spec:
     backendName: llamacpp
     args:
       - -fa # use flash attention
-    resources:
-      requests:
-        cpu: 4
-        memory: "8Gi"
-      limits:
-        cpu: 4
-        memory: "8Gi"

test/config/backends/llamacpp.yaml

Lines changed: 14 additions & 15 deletions
@@ -29,21 +29,20 @@ spec:
       limits:
         cpu: 2
         memory: 4Gi
-  # TODO: not supported yet, see https://github.com/InftyAI/llmaz/issues/240.
-  # - name: speculative-decoding
-  #   args:
-  #   - -m
-  #   - "{{ .ModelPath }}"
-  #   - -md
-  #   - "{{ .DraftModelPath }}"
-  #   - --host
-  #   - "0.0.0.0"
-  #   - --port
-  #   - "8080"
-  #   - --draft-max
-  #   - "16"
-  #   - --draft-min
-  #   - "5"
+  - name: speculative-decoding
+    args:
+    - -m
+    - "{{ .ModelPath }}"
+    - -md
+    - "{{ .DraftModelPath }}"
+    - --host
+    - "0.0.0.0"
+    - --port
+    - "8080"
+    - --draft-max
+    - "16"
+    - --draft-min
+    - "5"
   startupProbe:
     periodSeconds: 10
     failureThreshold: 30

test/e2e/playground_test.go

Lines changed: 25 additions & 28 deletions
@@ -142,32 +142,29 @@ var _ = ginkgo.Describe("playground e2e tests", func() {
         hpa := &autoscalingv2.HorizontalPodAutoscaler{}
         gomega.Expect(k8sClient.Get(ctx, types.NamespacedName{Name: playground.Name, Namespace: playground.Namespace}, hpa)).To(gomega.Succeed())
     })
-    // TODO: add e2e tests.
-    // ginkgo.It("SpeculativeDecoding with llama.cpp", func() {
-    //     targetModel := wrapper.MakeModel("llama2-7b-q8-gguf").FamilyName("llama2").ModelSourceWithModelHub("Huggingface").ModelSourceWithModelID("TheBloke/Llama-2-7B-GGUF", "llama-2-7b.Q8_0.gguf", "", nil, nil).Obj()
-    //     gomega.Expect(k8sClient.Create(ctx, targetModel)).To(gomega.Succeed())
-    //     defer func() {
-    //         gomega.Expect(k8sClient.Delete(ctx, targetModel)).To(gomega.Succeed())
-    //     }()
-    //     draftModel := wrapper.MakeModel("llama2-7b-q2-k-gguf").FamilyName("llama2").ModelSourceWithModelHub("Huggingface").ModelSourceWithModelID("TheBloke/Llama-2-7B-GGUF", "llama-2-7b.Q2_K.gguf", "", nil, nil).Obj()
-    //     gomega.Expect(k8sClient.Create(ctx, draftModel)).To(gomega.Succeed())
-    //     defer func() {
-    //         gomega.Expect(k8sClient.Delete(ctx, draftModel)).To(gomega.Succeed())
-    //     }()
-
-    //     playground := wrapper.MakePlayground("llamacpp-speculator", ns.Name).
-    //         MultiModelsClaim([]string{"llama2-7b-q8-gguf", "llama2-7b-q2-k-gguf"}, coreapi.SpeculativeDecoding).
-    //         BackendRuntime("llamacpp").BackendLimit("cpu", "4").BackendRequest("memory", "8Gi").
-    //         Replicas(1).
-    //         Obj()
-    //     gomega.Expect(k8sClient.Create(ctx, playground)).To(gomega.Succeed())
-    //     validation.ValidatePlayground(ctx, k8sClient, playground)
-    //     validation.ValidatePlaygroundStatusEqualTo(ctx, k8sClient, playground, inferenceapi.PlaygroundAvailable, "PlaygroundReady", metav1.ConditionTrue)
-
-    //     service := &inferenceapi.Service{}
-    //     gomega.Expect(k8sClient.Get(ctx, types.NamespacedName{Name: playground.Name, Namespace: playground.Namespace}, service)).To(gomega.Succeed())
-    //     validation.ValidateService(ctx, k8sClient, service)
-    //     validation.ValidateServiceStatusEqualTo(ctx, k8sClient, service, inferenceapi.ServiceAvailable, "ServiceReady", metav1.ConditionTrue)
-    //     validation.ValidateServicePods(ctx, k8sClient, service)
-    // })
+    ginkgo.It("SpeculativeDecoding with llama.cpp", func() {
+        targetModel := wrapper.MakeModel("llama2-7b-q8-gguf").FamilyName("llama2").ModelSourceWithModelHub("Huggingface").ModelSourceWithModelID("TheBloke/Llama-2-7B-GGUF", "llama-2-7b.Q8_0.gguf", "", nil, nil).Obj()
+        gomega.Expect(k8sClient.Create(ctx, targetModel)).To(gomega.Succeed())
+        defer func() {
+            gomega.Expect(k8sClient.Delete(ctx, targetModel)).To(gomega.Succeed())
+        }()
+        draftModel := wrapper.MakeModel("llama2-7b-q2-k-gguf").FamilyName("llama2").ModelSourceWithModelHub("Huggingface").ModelSourceWithModelID("TheBloke/Llama-2-7B-GGUF", "llama-2-7b.Q2_K.gguf", "", nil, nil).Obj()
+        gomega.Expect(k8sClient.Create(ctx, draftModel)).To(gomega.Succeed())
+        defer func() {
+            gomega.Expect(k8sClient.Delete(ctx, draftModel)).To(gomega.Succeed())
+        }()
+
+        playground := wrapper.MakePlayground("llamacpp-speculator", ns.Name).
+            ModelClaims([]string{"llama2-7b-q8-gguf", "llama2-7b-q2-k-gguf"}, []string{"main", "draft"}).
+            BackendRuntime("llamacpp").Replicas(1).Obj()
+        gomega.Expect(k8sClient.Create(ctx, playground)).To(gomega.Succeed())
+        validation.ValidatePlayground(ctx, k8sClient, playground)
+        validation.ValidatePlaygroundStatusEqualTo(ctx, k8sClient, playground, inferenceapi.PlaygroundAvailable, "PlaygroundReady", metav1.ConditionTrue)
+
+        service := &inferenceapi.Service{}
+        gomega.Expect(k8sClient.Get(ctx, types.NamespacedName{Name: playground.Name, Namespace: playground.Namespace}, service)).To(gomega.Succeed())
+        validation.ValidateService(ctx, k8sClient, service)
+        validation.ValidateServiceStatusEqualTo(ctx, k8sClient, service, inferenceapi.ServiceAvailable, "ServiceReady", metav1.ConditionTrue)
+        validation.ValidateServicePods(ctx, k8sClient, service)
+    })
 })
