Skip to content

Commit b9d8b96

Browse files
committed
ci: improve and simplify accelerate and transformers workloads
* Use `--dist loadfile` strategy in alignment with HF practice * Group together `pip install` commands * Add missing `pciutils` to accelerate workload * Drop `apt-get` loops as runners are now isolated test accelerate and transformers only disable_build disable_ut disable_e2e disable_distributed Signed-off-by: Dmitry Rogozhkin <[email protected]>
1 parent 24fab67 commit b9d8b96

File tree

2 files changed

+37
-26
lines changed

2 files changed

+37
-26
lines changed

.github/workflows/_linux_accelerate.yml

Lines changed: 21 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,8 @@ permissions: read-all
4444
concurrency:
4545
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
4646
cancel-in-progress: true
47+
env:
48+
TORCH_INDEX: '--pre --index-url https://download.pytorch.org/whl/nightly/xpu'
4749

4850
defaults:
4951
run:
@@ -105,7 +107,10 @@ jobs:
105107
HF_HUB_DOWNLOAD_TIMEOUT: 120
106108
PARSE_JUNIT: ${{ github.workspace }}/torch-xpu-ops/.github/scripts/parse-junitxml.py
107109
AGENT_TOOLSDIRECTORY: /tmp/xpu-tool
108-
PYTEST_ADDOPTS: -rsf --timeout 600 --timeout_method=thread --dist worksteal ${{ needs.prepare.outputs.pytest_extra_args }}
110+
# Usage of `--dist loadfile` is a must as HF tests have complex setups including
111+
# setUpClass and @first_run clauses. So the 'loadfile' strategy helps minimize
112+
# the scope of race conditions.
113+
PYTEST_ADDOPTS: -rsf --timeout 600 --timeout_method=thread --dist loadfile ${{ needs.prepare.outputs.pytest_extra_args }}
109114
env:
110115
accelerate: ${{ inputs.accelerate != '' && inputs.accelerate || 'v1.6.0' }}
111116
transformers: ${{ inputs.transformers != '' && inputs.transformers || 'v4.51.3' }}
@@ -125,16 +130,28 @@ jobs:
125130
uses: actions/setup-python@v5
126131
with:
127132
python-version: ${{ env.python }}
133+
- name: Prepare environment
134+
run: |
135+
sudo apt-get update
136+
# pciutils is needed to report available GPUs (we use lspci)
137+
# python3-dev is needed for torch inductor and extension compilations
138+
sudo apt-get install -y --no-install-recommends pciutils python3-dev
128139
- name: Check python
129140
run: |
130141
which python && python -V
131142
which pip && pip list
132143
pip install -U pip wheel setuptools
133144
- name: Install pytorch and deps
134145
run: |
135-
pip install junitparser
136-
pip install transformers==${{ env.transformers }}
137-
pip install torch torchvision torchaudio --pre --index-url https://download.pytorch.org/whl/nightly/xpu
146+
pip install $TORCH_INDEX torch torchvision torchaudio
147+
# Do NOT install HF transformers or accelerate before torch as we need
148+
# very specific version of the torch and HF would bring its own.
149+
pip install \
150+
junitparser \
151+
pytest \
152+
pytest-timeout \
153+
pytest-xdist \
154+
transformers==${{ env.transformers }}
138155
- name: Prepare Accelerate
139156
run: |
140157
cd $WORK_DIR
@@ -155,8 +172,6 @@ jobs:
155172
xpu-smi discovery -y --json --dump -1
156173
- name: Sanity check installed packages
157174
run: |
158-
# Use latest pytest
159-
pip install -U pytest pytest-timeout pytest-xdist
160175
# These checks are to exit earlier if for any reason torch
161176
# packages were reinstalled back to CUDA versions (not expected).
162177
pip show torch | grep Version | grep xpu

.github/workflows/_linux_transformers.yml

Lines changed: 16 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,7 @@ env:
7474
libswresample-dev
7575
libswscale-dev
7676
pciutils
77+
python3-dev
7778
TORCH_INDEX: '--pre --index-url https://download.pytorch.org/whl/nightly/xpu'
7879
AGENT_TOOLSDIRECTORY: /tmp/xpu-tool
7980

@@ -154,8 +155,12 @@ jobs:
154155
env:
155156
PYTORCH_DEBUG_XPU_FALLBACK: '1'
156157
TRANSFORMERS_TEST_DEVICE_SPEC: 'spec.py'
157-
# enable pytest parallel run, and continue others if meets crash case such as segmentation fault
158-
PYTEST_ADDOPTS: -rsf --timeout 600 --timeout_method=thread --dist worksteal ${{ needs.prepare.outputs.pytest_extra_args }}
158+
# Usage of `--dist loadfile` is a must as HF tests have complex setups including
159+
# setUpClass and @first_run clauses. So the 'loadfile' strategy helps minimize
160+
# the scope of race conditions. Besides, that's how HF Transformers recommends
161+
# running tests and how they run them in their own CI.
162+
# See: https://github.com/huggingface/transformers/blob/v4.56.2/CONTRIBUTING.md?plain=1#L312
PYTEST_ADDOPTS: -rsf --timeout 600 --timeout_method=thread --dist loadfile ${{ needs.prepare.outputs.pytest_extra_args }}
159164
strategy:
160165
fail-fast: false
161166
max-parallel: 1
@@ -224,21 +229,9 @@ jobs:
224229
fi
225230
- name: Prepare OS environment
226231
run: |
227-
# as jobs might run in parallel on the same system, apt-get might
228-
# step into the lock hold by other job
229-
start_time=$SECONDS
230-
while ! sudo apt-get update; do
231-
sleep 1;
232-
if (( $SECONDS - start_time > 60 )); then false; fi
233-
done
234-
while ! sudo apt-get install -y $PACKAGES; do
235-
sleep 1;
236-
if (( $SECONDS - start_time > 60 )); then false; fi
237-
done
238-
while ! git lfs install; do
239-
sleep 1;
240-
if (( $SECONDS - start_time > 60 )); then false; fi
241-
done
232+
sudo apt-get update
233+
sudo apt-get install -y $PACKAGES
234+
git lfs install
242235
- name: Setup python-${{ env.python }}
243236
uses: actions/setup-python@v5
244237
with:
@@ -250,12 +243,17 @@ jobs:
250243
pip install -U pip wheel setuptools
251244
- name: Prepare pytorch and deps
252245
run: |
253-
pip install junitparser
254246
pip install $TORCH_INDEX \
255247
torch==${{ needs.prepare.outputs.torch }} \
256248
torchvision==${{ needs.prepare.outputs.torchvision }} \
257249
torchaudio==${{ needs.prepare.outputs.torchaudio }} \
258250
pytorch-triton-xpu==${{needs.prepare.outputs.triton }}
251+
pip install \
252+
junitparser \
253+
pytest \
254+
pytest-timeout \
255+
pytest-xdist \
256+
pytest-shard
259257
- name: Prepare Transformers
260258
run: |
261259
pwd
@@ -281,8 +279,6 @@ jobs:
281279
xpu-smi discovery -y --json --dump -1
282280
- name: Sanity check installed packages
283281
run: |
284-
# Use latest pytest
285-
pip install -U pytest pytest-timeout pytest-xdist pytest-shard
286282
# These checks are to exit earlier if for any reason Transformers
287283
# reinstalled torch packages back to CUDA versions (not expected).
288284
pip show torch | grep Version | grep xpu

0 commit comments

Comments
 (0)