
Commit 0ad083d

Remove ElasticConfig from Service
Signed-off-by: kerthcet <[email protected]>
1 parent: 590d58d

8 files changed (+196, -27 lines)

api/core/v1alpha1/model_types.go

Lines changed: 7 additions & 8 deletions

@@ -98,14 +98,13 @@ type FlavorName string
 type Flavor struct {
     // Name represents the flavor name, which will be used in model claim.
     Name FlavorName `json:"name"`
-    // Requests defines the required accelerators to serve the model, like nvidia.com/gpu: 8.
-    // When GPU number is greater than 8, like 32, then multi-host inference is enabled and
-    // 32/8=4 hosts will be grouped as an unit, each host will have a resource request as
-    // nvidia.com/gpu: 8. The may change in the future if the GPU number limit is broken.
-    // Not recommended to set the cpu and memory usage here.
-    // If using playground, you can define the cpu/mem usage at backendConfig.
-    // If using service, you can define the cpu/mem at the container resources.
-    // Note: if you define the same accelerator requests at playground/service as well,
+    // Requests defines the required accelerators to serve the model for each replica,
+    // like <nvidia.com/gpu: 8>. For multi-hosts cases, the requests here indicates
+    // the resource requirements for each replica. This may change in the future.
+    // Not recommended to set the cpu and memory usage here:
+    // - if using playground, you can define the cpu/mem usage at backendConfig.
+    // - if using inference service, you can define the cpu/mem at the container resources.
+    // However, if you define the same accelerator requests at playground/service as well,
     // the requests here will be covered.
     // +optional
     Requests v1.ResourceList `json:"requests,omitempty"`

api/inference/v1alpha1/service_types.go

Lines changed: 0 additions & 5 deletions

@@ -35,11 +35,6 @@ type ServiceSpec struct {
     // LWS supports both single-host and multi-host scenarios, for single host
     // cases, only need to care about replicas, rolloutStrategy and workerTemplate.
     WorkloadTemplate lws.LeaderWorkerSetSpec `json:"workloadTemplate"`
-    // ElasticConfig defines the configuration for elastic usage,
-    // e.g. the max/min replicas. Default to 0 ~ Inf+.
-    // This requires to install the HPA first or will not work.
-    // +optional
-    ElasticConfig *ElasticConfig `json:"elasticConfig,omitempty"`
 }
 
 const (

api/inference/v1alpha1/zz_generated.deepcopy.go

Lines changed: 0 additions & 5 deletions (generated file, diff not rendered)

client-go/applyconfiguration/inference/v1alpha1/servicespec.go

Lines changed: 0 additions & 9 deletions (generated file, diff not rendered)
Lines changed: 2 additions & 0 deletions (new file)

FROM docker.io/vllm/vllm-openai:v0.6.5
COPY ray_init.sh /workspace/ray_init.sh
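The Service manifest below references the resulting image through the <image-built-from-dockerfile> placeholder. A minimal build-and-push sketch, assuming a hypothetical registry and tag of your own:

# Build the vLLM + ray_init.sh image from this Dockerfile (run in the directory
# that contains the Dockerfile and ray_init.sh).
# <your-registry>/vllm-ray:v0.6.5 is a hypothetical tag -- substitute your own.
docker build -t <your-registry>/vllm-ray:v0.6.5 .
docker push <your-registry>/vllm-ray:v0.6.5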
Lines changed: 79 additions & 0 deletions (new file)

apiVersion: inference.llmaz.io/v1alpha1
kind: Service
metadata:
  name: llama3-405b-instruct
spec:
  modelClaims:
    models:
    - name: llama3-405b-instruct
  workloadTemplate:
    replicas: 1
    leaderWorkerTemplate:
      size: 2
      restartPolicy: RecreateGroupOnPodRestart
      leaderTemplate:
        metadata:
          labels:
            role: leader
        spec:
          containers:
          - name: vllm-leader
            image: <image-built-from-dockerfile>
            command:
            - sh
            - -c
            - "/workspace/ray_init.sh leader --ray_cluster_size=$(LWS_GROUP_SIZE);
              python3 -m vllm.entrypoints.openai.api_server --port 8080 --model meta-llama/Meta-Llama-3.1-405B-Instruct --tensor-parallel-size 8 --pipeline_parallel_size 2"
            resources:
              limits:
                nvidia.com/gpu: "8"
                memory: 1124Gi
                ephemeral-storage: 800Gi
              requests:
                ephemeral-storage: 800Gi
                cpu: 125
            ports:
            - containerPort: 8080
            readinessProbe:
              tcpSocket:
                port: 8080
              initialDelaySeconds: 15
              periodSeconds: 10
            volumeMounts:
            - mountPath: /dev/shm
              name: dshm
          volumes:
          - name: dshm
            emptyDir:
              medium: Memory
              sizeLimit: 15Gi
      workerTemplate:
        spec:
          containers:
          - name: vllm-worker
            image: <image-built-from-dockerfile>
            command:
            - sh
            - -c
            - "/workspace/ray_init.sh worker --ray_address=$(LWS_LEADER_ADDRESS)"
            resources:
              limits:
                nvidia.com/gpu: "8"
                memory: 1124Gi
                ephemeral-storage: 800Gi
              requests:
                ephemeral-storage: 800Gi
                cpu: 125
            env:
            - name: HUGGING_FACE_HUB_TOKEN
              value: <your-hf-token>
            volumeMounts:
            - mountPath: /dev/shm
              name: dshm
          volumes:
          - name: dshm
            emptyDir:
              medium: Memory
              sizeLimit: 15Gi
  backendRuntimeConfig:
    name: ollama
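Once both manifests in this commit are applied (a deploy sketch follows the OpenModel below), the vLLM leader can be smoke-tested through its OpenAI-compatible endpoint. The pod name used here is an assumption based on LeaderWorkerSet naming (<service-name>-<group-index>); confirm it with kubectl get pods first.

# List the leader pods via the role=leader label set in leaderTemplate.
kubectl get pods -l role=leader

# Forward the leader's port 8080 and query the OpenAI-compatible API.
# "llama3-405b-instruct-0" is an assumed pod name and may differ in your cluster.
kubectl port-forward pod/llama3-405b-instruct-0 8080:8080 &
curl http://localhost:8080/v1/models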
Lines changed: 13 additions & 0 deletions (new file)

apiVersion: llmaz.io/v1alpha1
kind: OpenModel
metadata:
  name: llama3-405b-instruct
spec:
  familyName: llama3
  source:
    modelHub:
      modelID: meta-llama/Meta-Llama-3.1-405B-Instruct
  inferenceFlavors:
  - name: t4 # GPU type
    requests:
      nvidia.com/gpu: 1
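A minimal sketch of applying the two manifests, assuming they were saved locally as model.yaml and service.yaml (hypothetical file names):

# Apply the OpenModel first so the Service's modelClaims can resolve it,
# then apply the Service.
kubectl apply -f model.yaml
kubectl apply -f service.yaml

# Watch the LeaderWorkerSet pods come up: one leader plus one worker per
# replica, since leaderWorkerTemplate.size is 2.
kubectl get pods -w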
Lines changed: 95 additions & 0 deletions (new file)

#!/bin/bash

subcommand=$1
shift

ray_port=6379
ray_init_timeout=300

case "$subcommand" in
  worker)
    ray_address=""
    while [ $# -gt 0 ]; do
      case "$1" in
        --ray_address=*)
          ray_address="${1#*=}"
          ;;
        --ray_port=*)
          ray_port="${1#*=}"
          ;;
        --ray_init_timeout=*)
          ray_init_timeout="${1#*=}"
          ;;
        *)
          echo "unknown argument: $1"
          exit 1
      esac
      shift
    done

    if [ -z "$ray_address" ]; then
      echo "Error: Missing argument --ray_address"
      exit 1
    fi

    for (( i=0; i < $ray_init_timeout; i+=5 )); do
      ray start --address=$ray_address:$ray_port --block
      if [ $? -eq 0 ]; then
        echo "Worker: Ray runtime started with head address $ray_address:$ray_port"
        exit 0
      fi
      echo "Waiting until the ray worker is active..."
      sleep 5s;
    done
    echo "Ray worker starts timeout, head address: $ray_address:$ray_port"
    exit 1
    ;;

  leader)
    ray_cluster_size=""
    while [ $# -gt 0 ]; do
      case "$1" in
        --ray_port=*)
          ray_port="${1#*=}"
          ;;
        --ray_cluster_size=*)
          ray_cluster_size="${1#*=}"
          ;;
        --ray_init_timeout=*)
          ray_init_timeout="${1#*=}"
          ;;
        *)
          echo "unknown argument: $1"
          exit 1
      esac
      shift
    done

    if [ -z "$ray_cluster_size" ]; then
      echo "Error: Missing argument --ray_cluster_size"
      exit 1
    fi

    # start the ray daemon
    ray start --head --port=$ray_port

    # wait until all workers are active
    for (( i=0; i < $ray_init_timeout; i+=5 )); do
      active_nodes=`python3 -c 'import ray; ray.init(); print(sum(node["Alive"] for node in ray.nodes()))'`
      if [ $active_nodes -eq $ray_cluster_size ]; then
        echo "All ray workers are active and the ray cluster is initialized successfully."
        exit 0
      fi
      echo "Wait for all ray workers to be active. $active_nodes/$ray_cluster_size is active"
      sleep 5s;
    done

    echo "Waiting for all ray workers to be active timed out."
    exit 1
    ;;

  *)
    echo "unknown subcommand: $subcommand"
    exit 1
    ;;
esac
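For reference, a sketch of how the two subcommands are invoked; this mirrors the container commands in the Service manifest above, where LeaderWorkerSet injects LWS_GROUP_SIZE and LWS_LEADER_ADDRESS at runtime:

# Leader: start the Ray head and block until all workers have joined.
/workspace/ray_init.sh leader --ray_cluster_size=$LWS_GROUP_SIZE

# Worker: join the Ray cluster at the leader's address.
/workspace/ray_init.sh worker --ray_address=$LWS_LEADER_ADDRESS

# Both subcommands also accept --ray_port=<port> (default 6379) and
# --ray_init_timeout=<seconds> (default 300).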
