
Commit 0ad083d

Remove ElasticConfig from Service
Signed-off-by: kerthcet <[email protected]>
1 parent: 590d58d

8 files changed (+196, -27 lines)

api/core/v1alpha1/model_types.go

Lines changed: 7 additions & 8 deletions

@@ -98,14 +98,13 @@ type FlavorName string
 type Flavor struct {
     // Name represents the flavor name, which will be used in model claim.
     Name FlavorName `json:"name"`
-    // Requests defines the required accelerators to serve the model, like nvidia.com/gpu: 8.
-    // When GPU number is greater than 8, like 32, then multi-host inference is enabled and
-    // 32/8=4 hosts will be grouped as an unit, each host will have a resource request as
-    // nvidia.com/gpu: 8. The may change in the future if the GPU number limit is broken.
-    // Not recommended to set the cpu and memory usage here.
-    // If using playground, you can define the cpu/mem usage at backendConfig.
-    // If using service, you can define the cpu/mem at the container resources.
-    // Note: if you define the same accelerator requests at playground/service as well,
+    // Requests defines the required accelerators to serve the model for each replica,
+    // like <nvidia.com/gpu: 8>. For multi-hosts cases, the requests here indicates
+    // the resource requirements for each replica. This may change in the future.
+    // Not recommended to set the cpu and memory usage here:
+    // - if using playground, you can define the cpu/mem usage at backendConfig.
+    // - if using inference service, you can define the cpu/mem at the container resources.
+    // However, if you define the same accelerator requests at playground/service as well,
     // the requests here will be covered.
     // +optional
     Requests v1.ResourceList `json:"requests,omitempty"`

api/inference/v1alpha1/service_types.go

Lines changed: 0 additions & 5 deletions

@@ -35,11 +35,6 @@ type ServiceSpec struct {
     // LWS supports both single-host and multi-host scenarios, for single host
     // cases, only need to care about replicas, rolloutStrategy and workerTemplate.
     WorkloadTemplate lws.LeaderWorkerSetSpec `json:"workloadTemplate"`
-    // ElasticConfig defines the configuration for elastic usage,
-    // e.g. the max/min replicas. Default to 0 ~ Inf+.
-    // This requires to install the HPA first or will not work.
-    // +optional
-    ElasticConfig *ElasticConfig `json:"elasticConfig,omitempty"`
 }
 
 const (

api/inference/v1alpha1/zz_generated.deepcopy.go

Lines changed: 0 additions & 5 deletions (generated file, diff not rendered)

client-go/applyconfiguration/inference/v1alpha1/servicespec.go

Lines changed: 0 additions & 9 deletions (generated file, diff not rendered)
Lines changed: 2 additions & 0 deletions (new file)

FROM docker.io/vllm/vllm-openai:v0.6.5
COPY ray_init.sh /workspace/ray_init.sh
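The Service manifest below references the resulting image through the <image-built-from-dockerfile> placeholder. A minimal build-and-push sketch, assuming a hypothetical registry and tag of your own:

# Build the vLLM + ray_init.sh image from this Dockerfile (run in the directory
# that contains the Dockerfile and ray_init.sh).
# <your-registry>/vllm-ray:v0.6.5 is a hypothetical tag -- substitute your own.
docker build -t <your-registry>/vllm-ray:v0.6.5 .
docker push <your-registry>/vllm-ray:v0.6.5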
Lines changed: 79 additions & 0 deletions (new file)

apiVersion: inference.llmaz.io/v1alpha1
kind: Service
metadata:
  name: llama3-405b-instruct
spec:
  modelClaims:
    models:
    - name: llama3-405b-instruct
  workloadTemplate:
    replicas: 1
    leaderWorkerTemplate:
      size: 2
      restartPolicy: RecreateGroupOnPodRestart
      leaderTemplate:
        metadata:
          labels:
            role: leader
        spec:
          containers:
          - name: vllm-leader
            image: <image-built-from-dockerfile>
            command:
            - sh
            - -c
            - "/workspace/ray_init.sh leader --ray_cluster_size=$(LWS_GROUP_SIZE);
              python3 -m vllm.entrypoints.openai.api_server --port 8080 --model meta-llama/Meta-Llama-3.1-405B-Instruct --tensor-parallel-size 8 --pipeline_parallel_size 2"
            resources:
              limits:
                nvidia.com/gpu: "8"
                memory: 1124Gi
                ephemeral-storage: 800Gi
              requests:
                ephemeral-storage: 800Gi
                cpu: 125
            ports:
            - containerPort: 8080
            readinessProbe:
              tcpSocket:
                port: 8080
              initialDelaySeconds: 15
              periodSeconds: 10
            volumeMounts:
            - mountPath: /dev/shm
              name: dshm
          volumes:
          - name: dshm
            emptyDir:
              medium: Memory
              sizeLimit: 15Gi
      workerTemplate:
        spec:
          containers:
          - name: vllm-worker
            image: <image-built-from-dockerfile>
            command:
            - sh
            - -c
            - "/workspace/ray_init.sh worker --ray_address=$(LWS_LEADER_ADDRESS)"
            resources:
              limits:
                nvidia.com/gpu: "8"
                memory: 1124Gi
                ephemeral-storage: 800Gi
              requests:
                ephemeral-storage: 800Gi
                cpu: 125
            env:
            - name: HUGGING_FACE_HUB_TOKEN
              value: <your-hf-token>
            volumeMounts:
            - mountPath: /dev/shm
              name: dshm
          volumes:
          - name: dshm
            emptyDir:
              medium: Memory
              sizeLimit: 15Gi
  backendRuntimeConfig:
    name: ollama
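Once both manifests in this commit are applied (a deploy sketch follows the OpenModel below), the vLLM leader can be smoke-tested through its OpenAI-compatible endpoint. The pod name used here is an assumption based on LeaderWorkerSet naming (<service-name>-<group-index>); confirm it with kubectl get pods first.

# List the leader pods via the role=leader label set in leaderTemplate.
kubectl get pods -l role=leader

# Forward the leader's port 8080 and query the OpenAI-compatible API.
# "llama3-405b-instruct-0" is an assumed pod name and may differ in your cluster.
kubectl port-forward pod/llama3-405b-instruct-0 8080:8080 &
curl http://localhost:8080/v1/models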
Lines changed: 13 additions & 0 deletions (new file)

apiVersion: llmaz.io/v1alpha1
kind: OpenModel
metadata:
  name: llama3-405b-instruct
spec:
  familyName: llama3
  source:
    modelHub:
      modelID: meta-llama/Meta-Llama-3.1-405B-Instruct
  inferenceFlavors:
  - name: t4 # GPU type
    requests:
      nvidia.com/gpu: 1
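A minimal sketch of applying the two manifests, assuming they were saved locally as model.yaml and service.yaml (hypothetical file names):

# Apply the OpenModel first so the Service's modelClaims can resolve it,
# then apply the Service.
kubectl apply -f model.yaml
kubectl apply -f service.yaml

# Watch the LeaderWorkerSet pods come up: one leader plus one worker per
# replica, since leaderWorkerTemplate.size is 2.
kubectl get pods -w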
Lines changed: 95 additions & 0 deletions (new file)

#!/bin/bash

subcommand=$1
shift

ray_port=6379
ray_init_timeout=300

case "$subcommand" in
  worker)
    ray_address=""
    while [ $# -gt 0 ]; do
      case "$1" in
        --ray_address=*)
          ray_address="${1#*=}"
          ;;
        --ray_port=*)
          ray_port="${1#*=}"
          ;;
        --ray_init_timeout=*)
          ray_init_timeout="${1#*=}"
          ;;
        *)
          echo "unknown argument: $1"
          exit 1
      esac
      shift
    done

    if [ -z "$ray_address" ]; then
      echo "Error: Missing argument --ray_address"
      exit 1
    fi

    for (( i=0; i < $ray_init_timeout; i+=5 )); do
      ray start --address=$ray_address:$ray_port --block
      if [ $? -eq 0 ]; then
        echo "Worker: Ray runtime started with head address $ray_address:$ray_port"
        exit 0
      fi
      echo "Waiting until the ray worker is active..."
      sleep 5s;
    done
    echo "Ray worker starts timeout, head address: $ray_address:$ray_port"
    exit 1
    ;;

  leader)
    ray_cluster_size=""
    while [ $# -gt 0 ]; do
      case "$1" in
        --ray_port=*)
          ray_port="${1#*=}"
          ;;
        --ray_cluster_size=*)
          ray_cluster_size="${1#*=}"
          ;;
        --ray_init_timeout=*)
          ray_init_timeout="${1#*=}"
          ;;
        *)
          echo "unknown argument: $1"
          exit 1
      esac
      shift
    done

    if [ -z "$ray_cluster_size" ]; then
      echo "Error: Missing argument --ray_cluster_size"
      exit 1
    fi

    # start the ray daemon
    ray start --head --port=$ray_port

    # wait until all workers are active
    for (( i=0; i < $ray_init_timeout; i+=5 )); do
      active_nodes=`python3 -c 'import ray; ray.init(); print(sum(node["Alive"] for node in ray.nodes()))'`
      if [ $active_nodes -eq $ray_cluster_size ]; then
        echo "All ray workers are active and the ray cluster is initialized successfully."
        exit 0
      fi
      echo "Wait for all ray workers to be active. $active_nodes/$ray_cluster_size is active"
      sleep 5s;
    done

    echo "Waiting for all ray workers to be active timed out."
    exit 1
    ;;

  *)
    echo "unknown subcommand: $subcommand"
    exit 1
    ;;
esac
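For reference, a sketch of how the two subcommands are invoked; this mirrors the container commands in the Service manifest above, where LeaderWorkerSet injects LWS_GROUP_SIZE and LWS_LEADER_ADDRESS at runtime:

# Leader: start the Ray head and block until all workers have joined.
/workspace/ray_init.sh leader --ray_cluster_size=$LWS_GROUP_SIZE

# Worker: join the Ray cluster at the leader's address.
/workspace/ray_init.sh worker --ray_address=$LWS_LEADER_ADDRESS

# Both subcommands also accept --ray_port=<port> (default 6379) and
# --ray_init_timeout=<seconds> (default 300).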
