|
| 1 | +apiVersion: v1 |
| 2 | +kind: PersistentVolumeClaim |
| 3 | +metadata: |
| 4 | + name: llama-31-8b-sglang |
| 5 | +spec: |
| 6 | + accessModes: |
| 7 | + - ReadWriteMany |
| 8 | + resources: |
| 9 | + requests: |
| 10 | + storage: 30Gi |
| 11 | + storageClassName: default # change this to a storage class that supports ReadWriteMany |
| 12 | + volumeMode: Filesystem |
| 13 | +--- |
1 | 14 | apiVersion: node.k8s.io/v1
|
2 | 15 | kind: RuntimeClass
|
3 | 16 | metadata:
|
@@ -27,41 +40,64 @@ spec:
|
27 | 40 | containers:
|
28 | 41 | - name: meta-llama-31-8b-instruct-sglang
|
29 | 42 | image: docker.io/lmsysorg/sglang:latest
|
30 |
| - imagePullPolicy: Always # IfNotPresent or Never |
| 43 | + imagePullPolicy: Always # IfNotPresent or Never |
31 | 44 | ports:
|
32 | 45 | - containerPort: 30000
|
33 | 46 | command: ["python3", "-m", "sglang.launch_server"]
|
34 |
| - args: ["--model-path", "meta-llama/Llama-3.1-8B-Instruct", "--host", "0.0.0.0", "--port", "30000"] |
| 47 | + args: |
| 48 | + [ |
| 49 | + "--model-path", |
| 50 | + "meta-llama/Llama-3.1-8B-Instruct", |
| 51 | + "--host", |
| 52 | + "0.0.0.0", |
| 53 | + "--port", |
| 54 | + "30000", |
| 55 | + ] |
35 | 56 | env:
|
36 | 57 | - name: HF_TOKEN
|
37 | 58 | value: <secret>
|
38 | 59 | resources:
|
39 | 60 | limits:
|
40 | 61 | nvidia.com/gpu: 1
|
| 62 | + cpu: 8 |
| 63 | + memory: 40Gi |
| 64 | + requests: |
| 65 | + cpu: 2 |
| 66 | + memory: 16Gi |
| 67 | + nvidia.com/gpu: 1 |
41 | 68 | volumeMounts:
|
42 | 69 | - name: shm
|
43 | 70 | mountPath: /dev/shm
|
44 | 71 | - name: hf-cache
|
45 | 72 | mountPath: /root/.cache/huggingface
|
46 |
| - readOnly: true |
47 | 73 | - name: localtime
|
48 | 74 | mountPath: /etc/localtime
|
49 | 75 | readOnly: true
|
50 | 76 | livenessProbe:
|
51 | 77 | httpGet:
|
52 | 78 | path: /health
|
53 | 79 | port: 30000
|
54 |
| - initialDelaySeconds: 30 |
55 |
| - periodSeconds: 10 |
| 80 | + initialDelaySeconds: 120 |
| 81 | + periodSeconds: 15 |
| 82 | + timeoutSeconds: 10 |
| 83 | + failureThreshold: 3 |
| 84 | + readinessProbe: |
| 85 | + httpGet: |
| 86 | + path: /health_generate |
| 87 | + port: 30000 |
| 88 | + initialDelaySeconds: 120 |
| 89 | + periodSeconds: 15 |
| 90 | + timeoutSeconds: 10 |
| 91 | + failureThreshold: 3 |
| 92 | + successThreshold: 1 |
56 | 93 | volumes:
|
57 | 94 | - name: shm
|
58 | 95 | emptyDir:
|
59 | 96 | medium: Memory
|
60 | 97 | sizeLimit: 10Gi
|
61 | 98 | - name: hf-cache
|
62 |
| - hostPath: |
63 |
| - path: /root/.cache/huggingface |
64 |
| - type: Directory |
| 99 | + persistentVolumeClaim: |
| 100 | + claimName: llama-31-8b-sglang |
65 | 101 | - name: localtime
|
66 | 102 | hostPath:
|
67 | 103 | path: /etc/localtime
|
|
76 | 112 | app: meta-llama-31-8b-instruct-sglang
|
77 | 113 | ports:
|
78 | 114 | - protocol: TCP
|
79 |
| - port: 30000 # port on host |
80 |
| - targetPort: 30000 # port in container |
81 |
| - type: LoadBalancer |
| 115 | + port: 80 # port exposed by the Service |
| 116 | + targetPort: 30000 # port in container |
| 117 | + type: LoadBalancer # change to ClusterIP if needed |
0 commit comments