Skip to content

Commit c5009f3

Browse files
committed
add distributed UT xdist
1 parent 5ab5068 commit c5009f3

File tree

3 files changed

+69
-39
lines changed

3 files changed

+69
-39
lines changed

.github/actions/get-runner/action.yml

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,11 @@
11
name: Get Runner Infos
22

3+
inputs:
4+
ut_name:
5+
required: true
6+
type: string
7+
description: Which ut to launch
8+
39
outputs:
410
runner_id:
511
value: ${{ steps.runner.outputs.runner_id }}
@@ -21,6 +27,7 @@ runs:
2127
steps:
2228
- name: Get runner
2329
shell: bash -xe {0}
30+
if: ${{ ! contains(inputs.ut_name, 'distributed') }}
2431
id: runner
2532
run: |
2633
# get test runner
@@ -76,6 +83,64 @@ runs:
7683
echo "xpu_num=${xpu_num}" |tee -a ${GITHUB_OUTPUT}
7784
echo "cpus_per_xpu=${cpus_per_xpu}" |tee -a ${GITHUB_OUTPUT}
7885
echo "pytest_extra_args=${pytest_extra_args}" |tee -a ${GITHUB_OUTPUT}
86+
- name: Get distributed runner
  # Probes the self-hosted runner for distributed UT runs: publishes runner
  # identity, tunes the host when passwordless sudo is available, and builds
  # the pytest-xdist worker arguments (one worker per group of 4 XPUs).
  # NOTE(review): this step's id is `runner-dist`, while the action-level
  # output visible in this file reads `steps.runner.outputs.runner_id` and the
  # `runner` step is skipped for distributed runs — confirm the `outputs:`
  # section also maps `steps.runner-dist.outputs.*`, otherwise the outputs
  # are empty here.
  shell: bash -xe {0}
  if: ${{ contains(inputs.ut_name, 'distributed') }}
  id: runner-dist
  run: |
    # get test runner
    echo "runner_id=$(echo "${RUNNER_NAME}" |sed 's/\-[0-9]$//')" |tee -a "${GITHUB_OUTPUT}"
    echo "user_id=$(id -u)" |tee -a "${GITHUB_OUTPUT}"
    echo "render_id=$(getent group render |cut -d: -f3)" |tee -a "${GITHUB_OUTPUT}"
    echo "hostname=$(hostname)" |tee -a "${GITHUB_OUTPUT}"
    # show host info
    lscpu
    lshw -C display
    free -h
    df -h
    cat /etc/os-release
    uname -a
    # clinfo hang and reboot system to recover
    timeout 120 clinfo --list || sudo reboot
    scaling_governor=$(cat /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor |sort |uniq)
    # Test passwordless sudo directly; inside an `if` condition a non-zero
    # status does not trip `bash -e`.
    if sudo -n true > /dev/null 2>&1; then
      if [ "${scaling_governor}" != "performance" ];then
        # set frequency governor to performance mode
        sudo apt-get update
        sudo apt-get install -y linux-tools-common linux-tools-$(uname -r) linux-cloud-tools-$(uname -r)
        sudo cpupower set -b 0
        sudo cpupower frequency-set -g performance
      fi
      # clean cache
      sync; sudo sh -c "echo 3 > /proc/sys/vm/drop_caches" || true
    else
      echo "[INFO] You do NOT have ROOT permission to set system config."
      echo "       The frequency governor is ${scaling_governor}."
    fi
    # physical cores = cores per socket * sockets
    cpu_num="$(lscpu |grep -E 'Core\(s\) per socket:|Socket\(s\):' |awk 'BEGIN{sum=1}{sum*=$NF}END{printf sum}')"
    # count the device lines listed under the "Platform ... Graphics" entry
    xpu_num="$(clinfo --list |awk 'BEGIN{gpu=0;}{
      if(gpu==1 && $0~/Platform/){gpu=0}; if(gpu==1){print $0}; if($0~/Platform.*Graphics/){gpu=1}
    }' |wc -l)"
    # Guard the division: with no XPU detected awk would abort on division by
    # zero and `bash -e` would fail the step before the `-n 1` fallback below
    # is reached. In that case all CPUs are reported for the single worker.
    cpus_per_xpu="$(echo |awk -v c="${cpu_num}" -v x="${xpu_num}" '{
      if (x > 0) { printf c/x } else { printf c }
    }')"
    # One xdist worker per group of 4 XPUs, each pinned to its CPU slice.
    # NOTE(review): assumes xpu_num is a multiple of 4 and that the devices in
    # each group (or ZE_AFFINITY_MASK entries) are contiguous — TODO confirm.
    pytest_extra_args="$(echo |awk -v x="${xpu_num}" -v z="${ZE_AFFINITY_MASK}" -v cx="${cpus_per_xpu}" '{
      if (x > 0) {
        split(z, xpu_list, ",");
        for (i=0;i<x;i=i+4) {
          if (z != "") {
            ze = xpu_list[i+1];
          } else {
            ze = i;
          }
          printf(" --tx popen//env:ZE_AFFINITY_MASK=%d,%d,%d,%d//env:OMP_NUM_THREADS=%d//python=\"numactl -l -C %d-%d python\"",
            ze,ze+1,ze+2,ze+3,4*cx,i*cx,(i+4)*cx-1);
        }
      }else {
        printf(" -n 1 ");
      }
    }')"
    echo "xpu_num=${xpu_num}" |tee -a "${GITHUB_OUTPUT}"
    echo "cpus_per_xpu=${cpus_per_xpu}" |tee -a "${GITHUB_OUTPUT}"
    echo "pytest_extra_args=${pytest_extra_args}" |tee -a "${GITHUB_OUTPUT}"
79144
- name: Cleanup host
80145
shell: bash -xe {0}
81146
run: |

.github/workflows/_linux_ut.yml

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@ env:
3838
jobs:
3939
runner:
4040
runs-on: ${{ inputs.runner }}
41-
name: get-runner
41+
name: get-runner
4242
outputs:
4343
runner_id: ${{ steps.runner-info.outputs.runner_id }}
4444
user_id: ${{ steps.runner-info.outputs.user_id }}
@@ -53,6 +53,8 @@ jobs:
5353
uses: actions/checkout@v4
5454
- name: Get runner
5555
id: runner-info
56+
with:
57+
ut_name: ${{ inputs.ut }}
5658
uses: ./.github/actions/get-runner
5759

5860
test-in-container:
@@ -104,7 +106,7 @@ jobs:
104106
runs-on: ${{ needs.runner.outputs.runner_id }}
105107
env:
106108
AGENT_TOOLSDIRECTORY: /tmp/xpu-tool
107-
PYTEST_ADDOPTS: -v --timeout 3600 --timeout_method=thread -n 1
109+
PYTEST_ADDOPTS: -v --timeout 3600 --timeout_method=thread --dist worksteal ${{ needs.runner.outputs.pytest_extra_args }}
108110
steps:
109111
- name: Checkout torch-xpu-ops
110112
uses: actions/checkout@v4

test/xpu/run_distributed.py

Lines changed: 0 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,3 @@
1-
import os
21
import subprocess
32
import sys
43

@@ -9,42 +8,6 @@
98
res2 = 0
109
fail_test = []
1110

12-
# Get the xelink group card affinity
13-
ret = os.system("xpu-smi topology -m 2>&1|tee topology.log")
14-
if ret == 0:
15-
gpu_dict = {}
16-
with open("topology.log") as file:
17-
lines = file.readlines()
18-
for line in lines:
19-
if "CPU Affinity" in line:
20-
continue
21-
line = line.strip()
22-
if line.startswith("GPU "):
23-
items = line.split(" ")
24-
items = [x for x in items if x]
25-
gpu_id = items[1]
26-
i = gpu_id.split("/")[0]
27-
affinity = ""
28-
for j, item in enumerate(items):
29-
if "SYS" not in item and ("XL" in item or "S" in item):
30-
if len(affinity) == 0:
31-
affinity = str(j - 2)
32-
else:
33-
affinity = affinity + "," + str(j - 2)
34-
gpu_dict[i] = affinity
35-
36-
max_affinity = ""
37-
for key, value in gpu_dict.items():
38-
if len(value) > len(max_affinity):
39-
max_affinity = value
40-
41-
os.environ["ZE_AFFINITY_MASK"] = str(max_affinity)
42-
print(str("ZE_AFFINITY_MASK=" + os.environ.get("ZE_AFFINITY_MASK")))
43-
44-
else:
45-
print("xpu-smi topology failed")
46-
sys.exit(255)
47-
4811

4912
# run python test
5013
def run(test_command):

0 commit comments

Comments
 (0)