vllm-project · ae86zhizhi · Aug 7, 2025 · Aug 7, 2025 · Jeffwan · Aug 7, 2025
diff --git a/samples/network-policies/allow-kv-events.yaml b/samples/network-policies/allow-kv-events.yaml
@@ -0,0 +1,49 @@
+---
+# Lets the AIBrix gateway plugin subscribe to KV cache events published by
+# model pods.
+#
+# The model pods bind the event sockets (the quickstart samples configure
+# "tcp://*:5557" and "tcp://*:5558") and the subscriber connects to them, so the
+# connection is inbound to the model pods. The policy therefore selects the
+# model pods and lists the gateway plugin as the allowed ingress peer.
+#
+# NetworkPolicies are additive allow-lists: once any policy selects a pod for
+# Ingress, traffic that no policy allows is dropped. Make sure another policy
+# admits the serving port (8000 in the quickstart samples) for these pods.
+apiVersion: networking.k8s.io/v1
+kind: NetworkPolicy
+metadata:
+  name: allow-kv-events
+  # This sample targets the 'default' namespace for quickstart purposes.
+  # To deploy elsewhere, change this value; `kubectl apply -n <ns>` rejects a
+  # manifest whose metadata.namespace differs from <ns>.
+  namespace: default
+spec:
+  # Connection destination: pods that publish KV events.
+  podSelector:
+    matchLabels:
+      model.aibrix.ai/kv-events-enabled: "true"
+  policyTypes:
+    - Ingress
+  ingress:
+    - from:
+        # Gateway plugin pods in the same namespace as this policy.
+        - podSelector:
+            matchLabels:
+              app: gateway-plugins
+        # Gateway plugin pods in the AIBrix system namespace.
+        # NOTE(review): assumes that namespace is named aibrix-system; adjust
+        # the value below to match your installation.
+        - namespaceSelector:
+            matchLabels:
+              kubernetes.io/metadata.name: aibrix-system
+          podSelector:
+            matchLabels:
+              app: gateway-plugins
+      ports:
+        # Live KV event stream.
+        - protocol: TCP
+          port: 5557
+        # KV event replay.
+        - protocol: TCP
+          port: 5558
diff --git a/samples/quickstart/model-with-kv-events-env.yaml b/samples/quickstart/model-with-kv-events-env.yaml
@@ -0,0 +1,108 @@
+---
+# Quickstart model Deployment that publishes vLLM KV cache events over ZMQ,
+# with the KV event settings held in a container environment variable.
+#
+# vLLM takes KV event settings as one JSON document through --kv-events-config;
+# it does not read per-field VLLM_KV_EVENTS_* variables. The JSON is stored in
+# KV_EVENTS_CONFIG below, and Kubernetes expands $(KV_EVENTS_CONFIG) in the
+# container command before the process starts.
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  labels:
+    model.aibrix.ai/name: llama-8b-instruct
+    model.aibrix.ai/port: "8000"
+    model.aibrix.ai/kv-events-enabled: "true"
+  name: llama-8b-instruct
+  # This sample targets the 'default' namespace for quickstart purposes.
+  # To deploy elsewhere, change this value; `kubectl apply -n <ns>` rejects a
+  # manifest whose metadata.namespace differs from <ns>.
+  namespace: default
+spec:
+  replicas: 2
+  selector:
+    matchLabels:
+      model.aibrix.ai/name: llama-8b-instruct
+  template:
+    metadata:
+      labels:
+        model.aibrix.ai/name: llama-8b-instruct
+        # Pod-level copy of the label, so selectors that match pods can see it.
+        model.aibrix.ai/kv-events-enabled: "true"
+    spec:
+      containers:
+        - name: vllm-openai
+          # KV event publishing needs a vLLM release that ships
+          # --kv-events-config; v0.7.1 predates it.
+          image: vllm/vllm-openai:v0.10.0
+          command:
+            - python3
+            - -m
+            - vllm.entrypoints.openai.api_server
+            - --host
+            - "0.0.0.0"
+            - --port
+            - "8000"
+            - --model
+            - meta-llama/Llama-3.1-8B-Instruct
+            - --served-model-name
+            - llama-8b-instruct
+            # Expanded by Kubernetes from the env entry of the same name.
+            - --kv-events-config
+            - "$(KV_EVENTS_CONFIG)"
+          env:
+            # KV event configuration. The endpoints are bind addresses matching
+            # the kv-events / kv-replay container ports below; hwm is the ZMQ
+            # high water mark.
+            - name: KV_EVENTS_CONFIG
+              value: >-
+                {"enable_kv_cache_events": true,
+                "publisher": "zmq",
+                "endpoint": "tcp://*:5557",
+                "replay_endpoint": "tcp://*:5558",
+                "buffer_steps": 10000,
+                "hwm": 100000}
+          ports:
+            - containerPort: 8000
+              protocol: TCP
+              name: api
+            - containerPort: 5557
+              protocol: TCP
+              name: kv-events
+            - containerPort: 5558
+              protocol: TCP
+              name: kv-replay
+          resources:
+            limits:
+              nvidia.com/gpu: "1"
+            requests:
+              nvidia.com/gpu: "1"
+          # All three probes poll /health on the API port. The startup probe
+          # tolerates 30 failures at 5s intervals before the container restarts.
+          livenessProbe:
+            httpGet:
+              path: /health
+              port: 8000
+              scheme: HTTP
+            failureThreshold: 3
+            periodSeconds: 5
+            successThreshold: 1
+            timeoutSeconds: 1
+          readinessProbe:
+            httpGet:
+              path: /health
+              port: 8000
+              scheme: HTTP
+            failureThreshold: 5
+            periodSeconds: 5
+            successThreshold: 1
+            timeoutSeconds: 1
+          startupProbe:
+            httpGet:
+              path: /health
+              port: 8000
+              scheme: HTTP
+            failureThreshold: 30
+            periodSeconds: 5
+            successThreshold: 1
+            timeoutSeconds: 1
diff --git a/samples/quickstart/model-with-kv-events.yaml b/samples/quickstart/model-with-kv-events.yaml
@@ -0,0 +1,141 @@
+---
+# Quickstart model Deployment that publishes vLLM KV cache events over ZMQ.
+#
+# vLLM takes KV event settings as one JSON document through --kv-events-config
+# rather than per-field flags, so the settings are passed that way below.
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  labels:
+    model.aibrix.ai/name: deepseek-r1-distill-llama-8b
+    model.aibrix.ai/port: "8000"
+    model.aibrix.ai/kv-events-enabled: "true"  # Enable KV events
+  name: deepseek-r1-distill-llama-8b
+  # This sample targets the 'default' namespace for quickstart purposes.
+  # To deploy elsewhere, change this value; `kubectl apply -n <ns>` rejects a
+  # manifest whose metadata.namespace differs from <ns>.
+  namespace: default
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      model.aibrix.ai/name: deepseek-r1-distill-llama-8b
+  template:
+    metadata:
+      labels:
+        model.aibrix.ai/name: deepseek-r1-distill-llama-8b
+        model.aibrix.ai/kv-events-enabled: "true"  # Required for discovery
+    spec:
+      containers:
+        - name: vllm-openai
+          # KV event publishing needs a vLLM release that ships
+          # --kv-events-config; v0.7.1 predates it.
+          image: vllm/vllm-openai:v0.10.0
+          command:
+            - python3
+            - -m
+            - vllm.entrypoints.openai.api_server
+            - --host
+            - "0.0.0.0"
+            - --port
+            - "8000"
+            - --uvicorn-log-level
+            - warning
+            - --model
+            - deepseek-ai/DeepSeek-R1-Distill-Llama-8B
+            - --served-model-name
+            - deepseek-r1-distill-llama-8b
+            - --max-model-len
+            - "12288"
+            # KV event publishing. The endpoints are bind addresses and must
+            # match the kv-events / kv-replay container ports declared below.
+            - --kv-events-config
+            - >-
+              {"enable_kv_cache_events": true,
+              "publisher": "zmq",
+              "endpoint": "tcp://*:5557",
+              "replay_endpoint": "tcp://*:5558",
+              "buffer_steps": 10000}
+          ports:
+            - containerPort: 8000
+              protocol: TCP
+              name: api
+            # KV event ports
+            - containerPort: 5557
+              protocol: TCP
+              name: kv-events
+            - containerPort: 5558
+              protocol: TCP
+              name: kv-replay
+          resources:
+            limits:
+              nvidia.com/gpu: "1"
+            requests:
+              nvidia.com/gpu: "1"
+          # All three probes poll /health on the API port. The startup probe
+          # tolerates 30 failures at 5s intervals before the container restarts.
+          livenessProbe:
+            httpGet:
+              path: /health
+              port: 8000
+              scheme: HTTP
+            failureThreshold: 3
+            periodSeconds: 5
+            successThreshold: 1
+            timeoutSeconds: 1
+          readinessProbe:
+            httpGet:
+              path: /health
+              port: 8000
+              scheme: HTTP
+            failureThreshold: 5
+            periodSeconds: 5
+            successThreshold: 1
+            timeoutSeconds: 1
+          startupProbe:
+            httpGet:
+              path: /health
+              port: 8000
+              scheme: HTTP
+            failureThreshold: 30
+            periodSeconds: 5
+            successThreshold: 1
+            timeoutSeconds: 1
+
+---
+
+apiVersion: v1
+kind: Service
+metadata:
+  name: deepseek-r1-distill-llama-8b
+  # Quickstart default; edit this value to deploy to another namespace.
+  namespace: default
+  labels:
+    model.aibrix.ai/name: deepseek-r1-distill-llama-8b
+    prometheus-discovery: "true"
+  annotations:
+    prometheus.io/port: "8080"
+    prometheus.io/scrape: "true"
+spec:
+  type: ClusterIP
+  # NOTE(review): no container in this file's Deployment declares 8080; confirm.
+  selector:
+    model.aibrix.ai/name: deepseek-r1-distill-llama-8b
+  ports:
+    - name: serve
+      protocol: TCP
+      port: 8000
+      targetPort: 8000
+    - name: metrics
+      protocol: TCP
+      port: 8080
+      targetPort: 8080
+    # KV event ports: optional, exposed through the Service for debugging.
+    - name: kv-events
+      protocol: TCP
+      port: 5557
+      targetPort: 5557
+    - name: kv-replay
+      protocol: TCP
+      port: 5558
+      targetPort: 5558