Commit 3269bd9

haitwang-cloud authored and ssssnow committed
Add PVC and update resource limits in k8s config (#8489)
1 parent 7302e2f commit 3269bd9

File tree: 1 file changed (+47 −11 lines)

docker/k8s-sglang-service.yaml

Lines changed: 47 additions & 11 deletions
@@ -1,3 +1,16 @@
+apiVersion: v1
+kind: PersistentVolumeClaim
+metadata:
+  name: llama-31-8b-sglang
+spec:
+  accessModes:
+    - ReadWriteMany
+  resources:
+    requests:
+      storage: 30Gi
+  storageClassName: default # change this to your preferred storage class
+  volumeMode: Filesystem
+---
 apiVersion: node.k8s.io/v1
 kind: RuntimeClass
 metadata:
@@ -27,41 +40,64 @@ spec:
       containers:
         - name: meta-llama-31-8b-instruct-sglang
           image: docker.io/lmsysorg/sglang:latest
-          imagePullPolicy: Always # IfNotPresent or Never
+          imagePullPolicy: Always # IfNotPresent or Never
           ports:
             - containerPort: 30000
           command: ["python3", "-m", "sglang.launch_server"]
-          args: ["--model-path", "meta-llama/Llama-3.1-8B-Instruct", "--host", "0.0.0.0", "--port", "30000"]
+          args:
+            [
+              "--model-path",
+              "meta-llama/Llama-3.1-8B-Instruct",
+              "--host",
+              "0.0.0.0",
+              "--port",
+              "30000",
+            ]
           env:
             - name: HF_TOKEN
               value: <secret>
           resources:
             limits:
               nvidia.com/gpu: 1
+              cpu: 8
+              memory: 40Gi
+            requests:
+              cpu: 2
+              memory: 16Gi
+              nvidia.com/gpu: 1
           volumeMounts:
             - name: shm
               mountPath: /dev/shm
             - name: hf-cache
               mountPath: /root/.cache/huggingface
-              readOnly: true
             - name: localtime
               mountPath: /etc/localtime
               readOnly: true
           livenessProbe:
             httpGet:
               path: /health
               port: 30000
-            initialDelaySeconds: 30
-            periodSeconds: 10
+            initialDelaySeconds: 120
+            periodSeconds: 15
+            timeoutSeconds: 10
+            failureThreshold: 3
+          readinessProbe:
+            httpGet:
+              path: /health_generate
+              port: 30000
+            initialDelaySeconds: 120
+            periodSeconds: 15
+            timeoutSeconds: 10
+            failureThreshold: 3
+            successThreshold: 1
       volumes:
         - name: shm
           emptyDir:
             medium: Memory
             sizeLimit: 10Gi
         - name: hf-cache
-          hostPath:
-            path: /root/.cache/huggingface
-            type: Directory
+          persistentVolumeClaim:
+            claimName: llama-31-8b-sglang
         - name: localtime
           hostPath:
             path: /etc/localtime
@@ -76,6 +112,6 @@ spec:
     app: meta-llama-31-8b-instruct-sglang
   ports:
     - protocol: TCP
-      port: 30000 # port on host
-      targetPort: 30000 # port in container
-  type: LoadBalancer
+      port: 80 # port on host
+      targetPort: 30000 # port in container
+  type: LoadBalancer # change to ClusterIP if needed
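
Note on the hf-cache change: the volume now binds the llama-31-8b-sglang PersistentVolumeClaim instead of a read-only hostPath, so the Hugging Face cache must be writable and is populated on first start unless it is pre-filled. Below is a minimal sketch of a one-off Job that pre-downloads the weights into the claim; the Job name, the use of huggingface_hub's snapshot_download (assumed to be available inside the lmsysorg/sglang image), and the placeholder token are illustrative assumptions, not part of this commit.

# Hypothetical helper, not part of this commit: populate the PVC once so the
# server pod starts with a warm Hugging Face cache.
apiVersion: batch/v1
kind: Job
metadata:
  name: llama-31-8b-prefetch # assumed name
spec:
  template:
    spec:
      restartPolicy: Never
      containers:
        - name: prefetch
          image: docker.io/lmsysorg/sglang:latest
          command: ["python3", "-c"]
          args:
            [
              "from huggingface_hub import snapshot_download; snapshot_download('meta-llama/Llama-3.1-8B-Instruct')",
            ]
          env:
            - name: HF_TOKEN
              value: <secret> # same token the Deployment uses
          volumeMounts:
            - name: hf-cache
              mountPath: /root/.cache/huggingface # default HF cache directory
      volumes:
        - name: hf-cache
          persistentVolumeClaim:
            claimName: llama-31-8b-sglang

With the cache pre-populated this way, the 120-second initialDelaySeconds on the new liveness and readiness probes should mostly cover model loading rather than download time.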
