Skip to content

Commit 2aa85e8

Browse files
committed
ci: improve and simplify accelerate and transformers workloads
* Fix torch package version checks
* Use `--dist loadfile` strategy in alignment with HF practice
* Group together `pip install` commands
* Add missing `pciutils` to accelerate workload
* Drop `apt-get` loops as runners are now isolated

test accelerate and transformers only
disable_build disable_ut disable_e2e disable_distributed

Signed-off-by: Dmitry Rogozhkin <[email protected]>
1 parent 24fab67 commit 2aa85e8

File tree

2 files changed

+59
-32
lines changed

2 files changed

+59
-32
lines changed

.github/workflows/_linux_accelerate.yml

Lines changed: 21 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,8 @@ permissions: read-all
4444
concurrency:
4545
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
4646
cancel-in-progress: true
47+
env:
48+
TORCH_INDEX: '--pre --index-url https://download.pytorch.org/whl/nightly/xpu'
4749

4850
defaults:
4951
run:
@@ -105,7 +107,10 @@ jobs:
105107
HF_HUB_DOWNLOAD_TIMEOUT: 120
106108
PARSE_JUNIT: ${{ github.workspace }}/torch-xpu-ops/.github/scripts/parse-junitxml.py
107109
AGENT_TOOLSDIRECTORY: /tmp/xpu-tool
108-
PYTEST_ADDOPTS: -rsf --timeout 600 --timeout_method=thread --dist worksteal ${{ needs.prepare.outputs.pytest_extra_args }}
110+
# Usage of `--dist loadfile` is a must as HF tests have complex setups including
111+
# setUpClass and @first_run clauses. The 'loadfile' strategy helps to minimize
112+
# the scope of race conditions.
113+
PYTEST_ADDOPTS: -rsf --timeout 600 --timeout_method=thread --dist loadfile ${{ needs.prepare.outputs.pytest_extra_args }}
109114
env:
110115
accelerate: ${{ inputs.accelerate != '' && inputs.accelerate || 'v1.6.0' }}
111116
transformers: ${{ inputs.transformers != '' && inputs.transformers || 'v4.51.3' }}
@@ -125,16 +130,28 @@ jobs:
125130
uses: actions/setup-python@v5
126131
with:
127132
python-version: ${{ env.python }}
133+
- name: Prepare environment
134+
run: |
135+
sudo apt-get update
136+
# pciutils is needed to report available GPUs (we use lspci)
137+
# python3-dev is needed for torch inductor and extension compilations
138+
sudo apt-get install -y --no-install-recommends pciutils python3-dev
128139
- name: Check python
129140
run: |
130141
which python && python -V
131142
which pip && pip list
132143
pip install -U pip wheel setuptools
133144
- name: Install pytorch and deps
134145
run: |
135-
pip install junitparser
136-
pip install transformers==${{ env.transformers }}
137-
pip install torch torchvision torchaudio --pre --index-url https://download.pytorch.org/whl/nightly/xpu
146+
pip install $TORCH_INDEX torch torchvision torchaudio
147+
# Do NOT install HF transformers or accelerate before torch as we need
148+
# a very specific torch version and HF would bring in its own.
149+
pip install \
150+
junitparser \
151+
pytest \
152+
pytest-timeout \
153+
pytest-xdist \
154+
transformers==${{ env.transformers }}
138155
- name: Prepare Accelerate
139156
run: |
140157
cd $WORK_DIR
@@ -155,8 +172,6 @@ jobs:
155172
xpu-smi discovery -y --json --dump -1
156173
- name: Sanity check installed packages
157174
run: |
158-
# Use latest pytest
159-
pip install -U pytest pytest-timeout pytest-xdist
160175
# These checks are to exit earlier if for any reason torch
161176
# packages were reinstalled back to CUDA versions (not expected).
162177
pip show torch | grep Version | grep xpu

.github/workflows/_linux_transformers.yml

Lines changed: 38 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,7 @@ env:
7474
libswresample-dev
7575
libswscale-dev
7676
pciutils
77+
python3-dev
7778
TORCH_INDEX: '--pre --index-url https://download.pytorch.org/whl/nightly/xpu'
7879
AGENT_TOOLSDIRECTORY: /tmp/xpu-tool
7980

@@ -116,20 +117,36 @@ jobs:
116117
render_id: ${{ steps.runner-info.outputs.render_id }}
117118
hostname: ${{ steps.runner-info.outputs.hostname }}
118119
pytest_extra_args: ${{ steps.runner-info.outputs.pytest_extra_args }}
120+
env:
121+
VIRTUAL_ENV: ${{ github.workspace }}/.venv
119122
steps:
123+
- name: Install uv and python-${{ env.python }}
124+
uses: astral-sh/setup-uv@v6
125+
with:
126+
python-version: ${{ env.python }}
127+
- name: Prepare environment
128+
run: |
129+
rm -rf ${{ env.VIRTUAL_ENV }}
130+
uv venv ${{ env.VIRTUAL_ENV }}
120131
- id: getver
121132
run: |
122133
# We can't just `pip index version...` and get the last available
123134
# version as pytorch packages may have tricky dependencies. Instead
124-
# we dry run install packages and get versions which would be installed.
135+
# we install the packages and record the versions which got installed. Note
136+
# that using --dry-run is not actually reliable as it does not perform
137+
# a thorough check of package dependencies.
125138
# See: https://github.com/pytorch/pytorch/issues/154687
126-
pip install --dry-run --ignore-installed $TORCH_INDEX \
139+
uv pip install $TORCH_INDEX \
127140
torch torchvision torchaudio pytorch-triton-xpu >_log.txt
128141
129-
torch=$(cat _log.txt | grep "Would install" | sed -E "s/.*torch-([^ ]*).*/\1/")
130-
torchvision=$(cat _log.txt | grep "Would install" | sed -E "s/.*torchvision-([^ ]*).*/\1/")
131-
torchaudio=$(cat _log.txt | grep "Would install" | sed -E "s/.*torchaudio-([^ ]*).*/\1/")
132-
triton=$(cat _log.txt | grep "Would install" | sed -E "s/.*pytorch-triton-xpu-([^ ]*).*/\1/")
142+
torch=$(uv pip show torch | grep Version)
143+
torchvision=$(uv pip show torchvision | grep Version)
144+
torchaudio=$(uv pip show torchaudio | grep Version)
145+
triton=$(uv pip show pytorch-triton-xpu | grep Version)
146+
torch=${torch#Version: *}
147+
torchvision=${torchvision#Version: *}
148+
torchaudio=${torchaudio#Version: *}
149+
triton=${triton#Version: *}
133150
echo "torch=$torch" | tee -a "$GITHUB_OUTPUT"
134151
echo "torchvision=$torchvision" | tee -a "$GITHUB_OUTPUT"
135152
echo "torchaudio=$torchaudio" | tee -a "$GITHUB_OUTPUT"
@@ -154,8 +171,12 @@ jobs:
154171
env:
155172
PYTORCH_DEBUG_XPU_FALLBACK: '1'
156173
TRANSFORMERS_TEST_DEVICE_SPEC: 'spec.py'
157-
# enable pytest parallel run, and continue others if meets crash case such as segmentation fault
158-
PYTEST_ADDOPTS: -rsf --timeout 600 --timeout_method=thread --dist worksteal ${{ needs.prepare.outputs.pytest_extra_args }}
174+
# Usage of `--dist loadfile` is a must as HF tests have complex setups including
175+
# setUpClass and @first_run clauses. The 'loadfile' strategy helps to minimize
176+
# the scope of race conditions. Besides, that's how HF Transformers recommends
177+
# running tests and how they run them in their own CI.
178+
# See: https://github.com/huggingface/transformers/blob/v4.56.2/CONTRIBUTING.md?plain=1#L312
179+
PYTEST_ADDOPTS: -rsf --timeout 600 --timeout_method=thread --dist loadfile ${{ needs.prepare.outputs.pytest_extra_args }}
159180
strategy:
160181
fail-fast: false
161182
max-parallel: 1
@@ -224,21 +245,9 @@ jobs:
224245
fi
225246
- name: Prepare OS environment
226247
run: |
227-
# as jobs might run in parallel on the same system, apt-get might
228-
# step into the lock hold by other job
229-
start_time=$SECONDS
230-
while ! sudo apt-get update; do
231-
sleep 1;
232-
if (( $SECONDS - start_time > 60 )); then false; fi
233-
done
234-
while ! sudo apt-get install -y $PACKAGES; do
235-
sleep 1;
236-
if (( $SECONDS - start_time > 60 )); then false; fi
237-
done
238-
while ! git lfs install; do
239-
sleep 1;
240-
if (( $SECONDS - start_time > 60 )); then false; fi
241-
done
248+
sudo apt-get update
249+
sudo apt-get install -y $PACKAGES
250+
git lfs install
242251
- name: Setup python-${{ env.python }}
243252
uses: actions/setup-python@v5
244253
with:
@@ -250,12 +259,17 @@ jobs:
250259
pip install -U pip wheel setuptools
251260
- name: Prepare pytorch and deps
252261
run: |
253-
pip install junitparser
254262
pip install $TORCH_INDEX \
255263
torch==${{ needs.prepare.outputs.torch }} \
256264
torchvision==${{ needs.prepare.outputs.torchvision }} \
257265
torchaudio==${{ needs.prepare.outputs.torchaudio }} \
258266
pytorch-triton-xpu==${{needs.prepare.outputs.triton }}
267+
pip install \
268+
junitparser \
269+
pytest \
270+
pytest-timeout \
271+
pytest-xdist \
272+
pytest-shard
259273
- name: Prepare Transformers
260274
run: |
261275
pwd
@@ -281,8 +295,6 @@ jobs:
281295
xpu-smi discovery -y --json --dump -1
282296
- name: Sanity check installed packages
283297
run: |
284-
# Use latest pytest
285-
pip install -U pytest pytest-timeout pytest-xdist pytest-shard
286298
# These checks are to exit earlier if for any reason Transformers
287299
# reinstalled torch packages back to CUDA versions (not expected).
288300
pip show torch | grep Version | grep xpu

0 commit comments

Comments (0)