24 changes: 24 additions & 0 deletions samples/network-policies/allow-kv-events.yaml
@@ -0,0 +1,24 @@
---
apiVersion: networking.k8s.io/v1
kind: NetworkPolicy
Collaborator

Did you see any issues without using a NetworkPolicy?

Contributor Author

Gateway plugins are deployed in the aibrix-system namespace, while KV event producers (model pods) are deployed in the default namespace (for quickstart) or in user-specified namespaces. The current NetworkPolicy only permits same-namespace traffic, but the actual deployment requires cross-namespace communication. I will prepare and submit a patch soon.

Collaborator

@ae86zhizhi Are you planning to submit a new PR or a new commit in this PR?

metadata:
name: allow-kv-events
# Note: This sample uses 'default' namespace for quickstart purposes.
# To deploy to a different namespace, change this value or use: kubectl apply -f <file> -n <your-namespace>
namespace: default
spec:
podSelector:
matchLabels:
app: gateway-plugins
policyTypes:
- Ingress
ingress:
- from:
- podSelector:
matchLabels:
model.aibrix.ai/kv-events-enabled: "true"
ports:
- protocol: TCP
port: 5557
- protocol: TCP
port: 5558
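
As noted in the review discussion above, the gateway plugins run in the aibrix-system namespace while the KV event producers live in default (or a user-chosen namespace), so a same-namespace policy like this sample will not admit the actual traffic. A cross-namespace variant might look like the sketch below. This is an illustration only: it assumes the gateway plugins are deployed in aibrix-system as described in the discussion, reuses the sample's pod labels and ports, and uses an empty namespaceSelector that admits labeled producer pods from every namespace, which should be narrowed for production.

---
apiVersion: networking.k8s.io/v1
kind: NetworkPolicy
metadata:
  name: allow-kv-events-cross-namespace
  # Assumption: the gateway plugins are deployed in aibrix-system, per the discussion above.
  namespace: aibrix-system
spec:
  podSelector:
    matchLabels:
      app: gateway-plugins
  policyTypes:
    - Ingress
  ingress:
    - from:
        # An empty namespaceSelector matches all namespaces; combined with the
        # podSelector in the same entry, it admits only labeled KV event producers.
        # Narrow it (e.g. by kubernetes.io/metadata.name) to specific namespaces in production.
        - namespaceSelector: {}
          podSelector:
            matchLabels:
              model.aibrix.ai/kv-events-enabled: "true"
      ports:
        - protocol: TCP
          port: 5557
        - protocol: TCP
          port: 5558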
94 changes: 94 additions & 0 deletions samples/quickstart/model-with-kv-events-env.yaml
@@ -0,0 +1,94 @@
---
apiVersion: apps/v1
kind: Deployment
metadata:
labels:
model.aibrix.ai/name: llama-8b-instruct
model.aibrix.ai/kv-events-enabled: "true"
name: llama-8b-instruct
# Note: This sample uses 'default' namespace for quickstart purposes.
# To deploy to a different namespace, change this value or use: kubectl apply -f <file> -n <your-namespace>
namespace: default
spec:
replicas: 2
selector:
matchLabels:
model.aibrix.ai/name: llama-8b-instruct
template:
metadata:
labels:
model.aibrix.ai/name: llama-8b-instruct
model.aibrix.ai/kv-events-enabled: "true"
spec:
containers:
- name: vllm-openai
image: vllm/vllm-openai:v0.7.1
command:
- python3
- -m
- vllm.entrypoints.openai.api_server
- --host
- "0.0.0.0"
- --port
- "8000"
- --model
- meta-llama/Llama-3.1-8B-Instruct
- --served-model-name
- llama-8b-instruct
env:
# NEW: KV event configuration via environment
- name: VLLM_ENABLE_KV_CACHE_EVENTS
value: "true"
- name: VLLM_KV_EVENTS_PUBLISHER
value: "zmq"
- name: VLLM_KV_EVENTS_ENDPOINT
value: "tcp://*:5557"
- name: VLLM_KV_EVENTS_REPLAY_ENDPOINT
value: "tcp://*:5558"
- name: VLLM_KV_EVENTS_BUFFER_STEPS
value: "10000"
# Performance tuning
- name: VLLM_KV_EVENTS_HWM
value: "100000" # ZMQ high water mark
ports:
- containerPort: 8000
protocol: TCP
name: api
- containerPort: 5557
protocol: TCP
name: kv-events
- containerPort: 5558
protocol: TCP
name: kv-replay
resources:
limits:
nvidia.com/gpu: "1"
requests:
nvidia.com/gpu: "1"
livenessProbe:
httpGet:
path: /health
port: 8000
scheme: HTTP
failureThreshold: 3
periodSeconds: 5
successThreshold: 1
timeoutSeconds: 1
readinessProbe:
httpGet:
path: /health
port: 8000
scheme: HTTP
failureThreshold: 5
periodSeconds: 5
successThreshold: 1
timeoutSeconds: 1
startupProbe:
httpGet:
path: /health
port: 8000
scheme: HTTP
failureThreshold: 30
periodSeconds: 5
successThreshold: 1
timeoutSeconds: 1
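
The flag-based sample below ships a Service alongside its Deployment, while this env-based sample does not. If you want to reach this deployment's API and KV event ports the same way (for example for debugging), a matching Service might look like the following sketch; the object name and port names are assumptions that simply mirror the other sample.

---
apiVersion: v1
kind: Service
metadata:
  labels:
    model.aibrix.ai/name: llama-8b-instruct
  # Assumed name and namespace, mirroring the Deployment above.
  name: llama-8b-instruct
  namespace: default
spec:
  ports:
    - name: serve
      port: 8000
      protocol: TCP
      targetPort: 8000
    # KV event ports, exposed optionally for debugging, as in the other sample.
    - name: kv-events
      port: 5557
      protocol: TCP
      targetPort: 5557
    - name: kv-replay
      port: 5558
      protocol: TCP
      targetPort: 5558
  selector:
    model.aibrix.ai/name: llama-8b-instruct
  type: ClusterIP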
134 changes: 134 additions & 0 deletions samples/quickstart/model-with-kv-events.yaml
@@ -0,0 +1,134 @@
---
apiVersion: apps/v1
kind: Deployment
metadata:
labels:
model.aibrix.ai/name: deepseek-r1-distill-llama-8b
model.aibrix.ai/port: "8000"
model.aibrix.ai/kv-events-enabled: "true" # NEW: Enable KV events
name: deepseek-r1-distill-llama-8b
# Note: This sample uses 'default' namespace for quickstart purposes.
# To deploy to a different namespace, change this value or use: kubectl apply -f <file> -n <your-namespace>
namespace: default
spec:
replicas: 1
selector:
matchLabels:
model.aibrix.ai/name: deepseek-r1-distill-llama-8b
template:
metadata:
labels:
model.aibrix.ai/name: deepseek-r1-distill-llama-8b
model.aibrix.ai/kv-events-enabled: "true" # NEW: Required for discovery
spec:
containers:
- name: vllm-openai
image: vllm/vllm-openai:v0.7.1
command:
- python3
- -m
- vllm.entrypoints.openai.api_server
- --host
- "0.0.0.0"
- --port
- "8000"
- --uvicorn-log-level
- warning
- --model
- deepseek-ai/DeepSeek-R1-Distill-Llama-8B
- --served-model-name
- deepseek-r1-distill-llama-8b
- --max-model-len
- "12288"
# NEW: KV event publishing configuration
- --enable-kv-cache-events
- --kv-events-publisher
- zmq
- --kv-events-endpoint
- "tcp://*:5557"
- --kv-events-replay-endpoint
- "tcp://*:5558"
- --kv-events-buffer-steps
- "10000"
ports:
- containerPort: 8000
protocol: TCP
name: api
# NEW: KV event ports
- containerPort: 5557
protocol: TCP
name: kv-events
- containerPort: 5558
protocol: TCP
name: kv-replay
resources:
limits:
nvidia.com/gpu: "1"
requests:
nvidia.com/gpu: "1"
# Health checks remain the same
livenessProbe:
httpGet:
path: /health
port: 8000
scheme: HTTP
failureThreshold: 3
periodSeconds: 5
successThreshold: 1
timeoutSeconds: 1
readinessProbe:
httpGet:
path: /health
port: 8000
scheme: HTTP
failureThreshold: 5
periodSeconds: 5
successThreshold: 1
timeoutSeconds: 1
startupProbe:
httpGet:
path: /health
port: 8000
scheme: HTTP
failureThreshold: 30
periodSeconds: 5
successThreshold: 1
timeoutSeconds: 1

---

apiVersion: v1
kind: Service
metadata:
labels:
model.aibrix.ai/name: deepseek-r1-distill-llama-8b
prometheus-discovery: "true"
annotations:
prometheus.io/scrape: "true"
prometheus.io/port: "8080"
name: deepseek-r1-distill-llama-8b
# Note: This sample uses 'default' namespace for quickstart purposes.
# To deploy to a different namespace, change this value or use: kubectl apply -f <file> -n <your-namespace>
namespace: default
spec:
ports:
- name: serve
port: 8000
protocol: TCP
targetPort: 8000
- name: metrics
port: 8080
protocol: TCP
targetPort: 8080
# NEW: Expose KV event ports (optional, for debugging)
- name: kv-events
port: 5557
protocol: TCP
targetPort: 5557
- name: kv-replay
port: 5558
protocol: TCP
targetPort: 5558
selector:
model.aibrix.ai/name: deepseek-r1-distill-llama-8b
type: ClusterIP