
Commit f1fa51f

Merge pull request #402 from cr7258/speculative-llamacpp
feat: support speculative decoding for llamacpp
2 parents 4f16a96 + fca2fbe commit f1fa51f

File tree: 7 files changed, +59 −53 lines


api/inference/v1alpha1/config_types.go

Lines changed: 0 additions & 1 deletion
@@ -42,7 +42,6 @@ type BackendRuntimeConfig struct {
     // ConfigName represents the recommended configuration name for the backend,
     // It will be inferred from the models in the runtime if not specified, e.g. default,
     // speculative-decoding.
-    // +kubebuilder:default=default
     ConfigName *string `json:"configName,omitempty"`
     // Args defined here will "append" the args defined in the recommendedConfig,
     // either explicitly configured in configName or inferred in the runtime.
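With the `+kubebuilder:default=default` marker removed, `configName` no longer defaults to `default` at admission time; when it is left unset, the recommended configuration is inferred from the models claimed at runtime, which is what lets a two-model claim resolve to `speculative-decoding`. As a rough illustration, a Playground relying on that inference could look like the sketch below; the `modelClaims` layout is an assumption based on the project's docs examples and the e2e wrapper call `ModelClaims([...], []string{"main", "draft"})`, not something shown in this diff.

# Hedged sketch: configName omitted so the speculative-decoding config is inferred.
# The modelClaims field layout is assumed, not taken from this commit.
apiVersion: inference.llmaz.io/v1alpha1
kind: Playground
metadata:
  name: llamacpp-speculator
spec:
  replicas: 1
  modelClaims:
    models:
    - name: llama2-7b-q8-gguf    # main model
      role: main
    - name: llama2-7b-q2-k-gguf  # draft model
      role: draft
  backendRuntime:
    backendName: llamacpp
    # configName intentionally left unset; inferred as speculative-decoding.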

chart/templates/backends/llamacpp.yaml

Lines changed: 14 additions & 0 deletions
@@ -49,6 +49,20 @@ spec:
       limits:
         cpu: 2
         memory: 4Gi
+  - name: speculative-decoding
+    args:
+    - -m
+    - "{{`{{ .ModelPath }}`}}"
+    - -md
+    - "{{`{{ .DraftModelPath }}`}}"
+    - --host
+    - "0.0.0.0"
+    - --port
+    - "8080"
+    - --draft-max
+    - "16"
+    - --draft-min
+    - "5"
   startupProbe:
     periodSeconds: 10
     failureThreshold: 30
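When this recommended config is rendered, the Go-template placeholders are replaced with the actual model mount paths, and any args set on the Playground itself are appended afterwards, following the "append" semantics documented on BackendRuntimeConfig.Args. A hedged sketch of the resulting llama-server arguments follows; the /workspace/models paths are placeholders rather than the loader's real mount points.

# Hedged sketch of the rendered speculative-decoding args; paths are hypothetical.
args:
- -m
- /workspace/models/llama-2-7b.Q8_0.gguf   # substituted for {{ .ModelPath }}
- -md
- /workspace/models/llama-2-7b.Q2_K.gguf   # substituted for {{ .DraftModelPath }}
- --host
- "0.0.0.0"
- --port
- "8080"
- --draft-max
- "16"
- --draft-min
- "5"
- -fa                                      # appended from the Playground's own args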

config/crd/bases/inference.llmaz.io_playgrounds.yaml

Lines changed: 0 additions & 1 deletion
@@ -59,7 +59,6 @@ spec:
                  the hood, e.g. vLLM.
                type: string
              configName:
-               default: default
                description: |-
                  ConfigName represents the recommended configuration name for the backend,
                  It will be inferred from the models in the runtime if not specified, e.g. default,

docs/examples/README.md

Lines changed: 5 additions & 0 deletions
@@ -12,6 +12,7 @@ We provide a set of examples to help you serve large language models, by default
 - [Deploy models via TensorRT-LLM](#deploy-models-via-tensorrt-llm)
 - [Deploy models via text-generation-inference](#deploy-models-via-text-generation-inference)
 - [Deploy models via ollama](#deploy-models-via-ollama)
+- [Speculative Decoding with llama.cpp](#speculative-decoding-with-llamacpp)
 - [Speculative Decoding with vLLM](#speculative-decoding-with-vllm)
 - [Multi-Host Inference](#multi-host-inference)
 - [Deploy Host Models](#deploy-host-models)
@@ -59,6 +60,10 @@ By default, we use [vLLM](https://github.com/vllm-project/vllm) as the inference
 
 [ollama](https://github.com/ollama/ollama) based on llama.cpp, aims for local deploy. see [example](./ollama/) here.
 
+### Speculative Decoding with llama.cpp
+
+llama.cpp supports speculative decoding to significantly improve inference performance, see [example](./speculative-decoding/llamacpp/) here.
+
 ### Speculative Decoding with vLLM
 
 [Speculative Decoding](https://arxiv.org/abs/2211.17192) can improve inference performance efficiently, see [example](./speculative-decoding/vllm/) here.

docs/examples/speculative-decoding/llamacpp/playground.yaml

Lines changed: 1 addition & 8 deletions
@@ -1,5 +1,5 @@
 # This is just an toy example, because it doesn't make any sense
-# in real world, drafting tokens for the model with similar size.
+# in real world, drafting tokens for the model with smaller size.
 apiVersion: llmaz.io/v1alpha1
 kind: OpenModel
 metadata:
@@ -38,10 +38,3 @@ spec:
     backendName: llamacpp
     args:
       - -fa # use flash attention
-    resources:
-      requests:
-        cpu: 4
-        memory: "8Gi"
-      limits:
-        cpu: 4
-        memory: "8Gi"

test/config/backends/llamacpp.yaml

Lines changed: 14 additions & 15 deletions
@@ -29,21 +29,20 @@ spec:
       limits:
         cpu: 2
         memory: 4Gi
-  # TODO: not supported yet, see https://github.com/InftyAI/llmaz/issues/240.
-  # - name: speculative-decoding
-  #   args:
-  #   - -m
-  #   - "{{ .ModelPath }}"
-  #   - -md
-  #   - "{{ .DraftModelPath }}"
-  #   - --host
-  #   - "0.0.0.0"
-  #   - --port
-  #   - "8080"
-  #   - --draft-max
-  #   - "16"
-  #   - --draft-min
-  #   - "5"
+  - name: speculative-decoding
+    args:
+    - -m
+    - "{{ .ModelPath }}"
+    - -md
+    - "{{ .DraftModelPath }}"
+    - --host
+    - "0.0.0.0"
+    - --port
+    - "8080"
+    - --draft-max
+    - "16"
+    - --draft-min
+    - "5"
   startupProbe:
     periodSeconds: 10
     failureThreshold: 30

test/e2e/playground_test.go

Lines changed: 25 additions & 28 deletions
@@ -142,32 +142,29 @@ var _ = ginkgo.Describe("playground e2e tests", func() {
         hpa := &autoscalingv2.HorizontalPodAutoscaler{}
         gomega.Expect(k8sClient.Get(ctx, types.NamespacedName{Name: playground.Name, Namespace: playground.Namespace}, hpa)).To(gomega.Succeed())
     })
-    // TODO: add e2e tests.
-    // ginkgo.It("SpeculativeDecoding with llama.cpp", func() {
-    //     targetModel := wrapper.MakeModel("llama2-7b-q8-gguf").FamilyName("llama2").ModelSourceWithModelHub("Huggingface").ModelSourceWithModelID("TheBloke/Llama-2-7B-GGUF", "llama-2-7b.Q8_0.gguf", "", nil, nil).Obj()
-    //     gomega.Expect(k8sClient.Create(ctx, targetModel)).To(gomega.Succeed())
-    //     defer func() {
-    //         gomega.Expect(k8sClient.Delete(ctx, targetModel)).To(gomega.Succeed())
-    //     }()
-    //     draftModel := wrapper.MakeModel("llama2-7b-q2-k-gguf").FamilyName("llama2").ModelSourceWithModelHub("Huggingface").ModelSourceWithModelID("TheBloke/Llama-2-7B-GGUF", "llama-2-7b.Q2_K.gguf", "", nil, nil).Obj()
-    //     gomega.Expect(k8sClient.Create(ctx, draftModel)).To(gomega.Succeed())
-    //     defer func() {
-    //         gomega.Expect(k8sClient.Delete(ctx, draftModel)).To(gomega.Succeed())
-    //     }()
-
-    //     playground := wrapper.MakePlayground("llamacpp-speculator", ns.Name).
-    //         MultiModelsClaim([]string{"llama2-7b-q8-gguf", "llama2-7b-q2-k-gguf"}, coreapi.SpeculativeDecoding).
-    //         BackendRuntime("llamacpp").BackendLimit("cpu", "4").BackendRequest("memory", "8Gi").
-    //         Replicas(1).
-    //         Obj()
-    //     gomega.Expect(k8sClient.Create(ctx, playground)).To(gomega.Succeed())
-    //     validation.ValidatePlayground(ctx, k8sClient, playground)
-    //     validation.ValidatePlaygroundStatusEqualTo(ctx, k8sClient, playground, inferenceapi.PlaygroundAvailable, "PlaygroundReady", metav1.ConditionTrue)
-
-    //     service := &inferenceapi.Service{}
-    //     gomega.Expect(k8sClient.Get(ctx, types.NamespacedName{Name: playground.Name, Namespace: playground.Namespace}, service)).To(gomega.Succeed())
-    //     validation.ValidateService(ctx, k8sClient, service)
-    //     validation.ValidateServiceStatusEqualTo(ctx, k8sClient, service, inferenceapi.ServiceAvailable, "ServiceReady", metav1.ConditionTrue)
-    //     validation.ValidateServicePods(ctx, k8sClient, service)
-    // })
+    ginkgo.It("SpeculativeDecoding with llama.cpp", func() {
+        targetModel := wrapper.MakeModel("llama2-7b-q8-gguf").FamilyName("llama2").ModelSourceWithModelHub("Huggingface").ModelSourceWithModelID("TheBloke/Llama-2-7B-GGUF", "llama-2-7b.Q8_0.gguf", "", nil, nil).Obj()
+        gomega.Expect(k8sClient.Create(ctx, targetModel)).To(gomega.Succeed())
+        defer func() {
+            gomega.Expect(k8sClient.Delete(ctx, targetModel)).To(gomega.Succeed())
+        }()
+        draftModel := wrapper.MakeModel("llama2-7b-q2-k-gguf").FamilyName("llama2").ModelSourceWithModelHub("Huggingface").ModelSourceWithModelID("TheBloke/Llama-2-7B-GGUF", "llama-2-7b.Q2_K.gguf", "", nil, nil).Obj()
+        gomega.Expect(k8sClient.Create(ctx, draftModel)).To(gomega.Succeed())
+        defer func() {
+            gomega.Expect(k8sClient.Delete(ctx, draftModel)).To(gomega.Succeed())
+        }()
+
+        playground := wrapper.MakePlayground("llamacpp-speculator", ns.Name).
+            ModelClaims([]string{"llama2-7b-q8-gguf", "llama2-7b-q2-k-gguf"}, []string{"main", "draft"}).
+            BackendRuntime("llamacpp").Replicas(1).Obj()
+        gomega.Expect(k8sClient.Create(ctx, playground)).To(gomega.Succeed())
+        validation.ValidatePlayground(ctx, k8sClient, playground)
+        validation.ValidatePlaygroundStatusEqualTo(ctx, k8sClient, playground, inferenceapi.PlaygroundAvailable, "PlaygroundReady", metav1.ConditionTrue)
+
+        service := &inferenceapi.Service{}
+        gomega.Expect(k8sClient.Get(ctx, types.NamespacedName{Name: playground.Name, Namespace: playground.Namespace}, service)).To(gomega.Succeed())
+        validation.ValidateService(ctx, k8sClient, service)
+        validation.ValidateServiceStatusEqualTo(ctx, k8sClient, service, inferenceapi.ServiceAvailable, "ServiceReady", metav1.ConditionTrue)
+        validation.ValidateServicePods(ctx, k8sClient, service)
+    })
 })
