E2E Windows, accuracy, huggingface, inference, float16 #131

Workflow file for this run

name: E2E on Windows
run-name: ${{ inputs.run_name }}
on:
  workflow_dispatch:
    inputs:
      test_mode:
        description: accuracy or performance
        type: string
        default: accuracy
      suite:
        description: Test suite
        type: choice
        options:
          - all
          - huggingface
          - timm_models
          - torchbench
        default: all
      mode:
        description: Inference, inference-with-freezing, or training
        type: choice
        options:
          - all
          - inference
          - inference-with-freezing
          - training
        default: all
      dtype:
        description: Data type
        type: choice
        options:
          - all
          - amp_bf16
          - amp_fp16
          - bfloat16
          - float16
          - float32
        default: all
      models:
        description: Run all models or a subset from .github/models/{test_mode}/{suite}.txt
        type: choice
        options:
          - all
          - subset
        default: subset
      check_all_subset_models:
        description: In "subset" mode, keep going after errors
        type: boolean
        default: false
      only_one_model:
        description: Run only this one model
        type: string
        default: ""
      runner_label:
        description: Runner label
        type: string
        default: "b580"
      TORCH_COMPILE_DEBUG:
        description: TORCH_COMPILE_DEBUG
        type: string
        default: ""
      run_name:
        description: Custom run name
        type: string
        default: "E2E on Windows"
permissions: read-all
env:
  PYTHONIOENCODING: utf-8
  NEW_WORKSPACE: C:\gh${{ github.run_id }}
  TRITON_DISABLE_LINE_INFO: 1
  PYTHON_VERSION: "3.10"
  BENCHMARK_REPO: pytorch/benchmark
jobs:
  tests:
    name: Tests
    runs-on:
      - windows
      - ${{ inputs.runner_label }}
    timeout-minutes: 1440 # 24h
    strategy:
      fail-fast: false
    steps:
      - name: Print inputs
        shell: bash
        run: |
          cat <<EOF
          ${{ toJSON(github.event.inputs) }}
          EOF
      - name: Checkout repository
        uses: actions/checkout@v4
      - name: Setup Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.10'
      - name: Clean up old workspaces
        shell: bash
        run: |
          rm -rf /c/gh*
      # Copy workspace to a temporary location with a shorter name.
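      # (Windows caps paths at 260 characters by default and PyTorch/Triton build
      # trees can get deep, so the short C:\gh<run_id> path is presumably meant to
      # keep file names under that limit.)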
      - name: Copy workspace
        run: |
          Copy-Item -Path ${{ github.workspace }} -Destination ${{ env.NEW_WORKSPACE }} -Recurse
      - name: Create venv
        run: |
          python -m venv .venv
      - name: Install PyTorch (source)
        run: |
          .venv\Scripts\activate.ps1
          Invoke-BatchFile "C:\Program Files (x86)\Intel\oneAPI\setvars.bat"
          # Required to build on Windows
          $env:CMAKE_SHARED_LINKER_FLAGS = "/FORCE:MULTIPLE"
          $env:CMAKE_MODULE_LINKER_FLAGS = "/FORCE:MULTIPLE"
          $env:CMAKE_EXE_LINKER_FLAGS = "/FORCE:MULTIPLE"
          $env:TORCH_XPU_ARCH_LIST = "bmg,dg2,arl-h,mtl-h"
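          # TORCH_XPU_ARCH_LIST limits the XPU device binaries to specific Intel GPU
          # architectures (bmg = Battlemage, dg2 = Alchemist/DG2, arl-h = Arrow Lake-H,
          # mtl-h = Meteor Lake-H), presumably to keep build time and size down.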
bash -c "PYTORCH_PROJ=/c/pytorch ./scripts/install-pytorch.sh --source --check-wheel"
- name: PyTorch version
run: |
.venv\Scripts\activate.ps1
Invoke-BatchFile "C:\Program Files (x86)\Intel\oneAPI\setvars.bat"
python -c 'import torch; print(torch.__version__)' | Tee-Object -Variable PYTORCH_VERSION
echo "PYTORCH_VERSION=$PYTORCH_VERSION" | Out-File -FilePath $env:GITHUB_ENV -Encoding utf8 -Append
      - name: Clean up Triton cache
        shell: bash
        run: |
          rm -rf ~/.triton/cache
      # We need ninja >= 1.12.0 to support long names on Windows. That version is not
      # yet available on PyPI, so instead of installing ninja with pip we use the
      # 1.12.1 preinstalled on the runner.
      - name: Setup Triton
        run: |
          .venv\Scripts\activate.ps1
          Invoke-BatchFile "C:\Program Files\Microsoft Visual Studio\2022\Community\VC\Auxiliary\Build\vcvarsall.bat" x64
          cd ${{ env.NEW_WORKSPACE }}
          pip install -U wheel pybind11 cython cmake
          pip install -v '.[build,tests,tutorials]'
      - name: Triton version
        run: |
          .venv\Scripts\activate.ps1
          Invoke-BatchFile "C:\Program Files (x86)\Intel\oneAPI\setvars.bat"
          python -c 'import triton; print(triton.__version__)'
      - name: Identify pinned versions
        shell: bash
        run: |
          cd /c/pytorch
          echo "BENCHMARK_COMMIT_ID=$(<.ci/docker/ci_commit_pins/torchbench.txt)" | tee -a "$GITHUB_ENV"
          echo "TORCHVISION_COMMIT_ID=$(<.github/ci_commit_pins/vision.txt)" | tee -a "$GITHUB_ENV"
          echo "TORCHAUDIO_COMMIT_ID=$(<.github/ci_commit_pins/audio.txt)" | tee -a "$GITHUB_ENV"
          echo "TIMM_COMMIT_ID=$(<.ci/docker/ci_commit_pins/timm.txt)" | tee -a "$GITHUB_ENV"
      - name: Install python test dependencies
        run: |
          .venv\Scripts\activate.ps1
          pip install pyyaml pandas scipy 'numpy==1.26.4' psutil pyre_extensions torchrec
      - name: Install transformers package
        if: inputs.suite == 'all' || inputs.suite == 'huggingface'
        run: |
          cd pytorch
          pip install -r .ci/docker/ci_commit_pins/huggingface-requirements.txt
      - name: Install torchvision package
        if: inputs.suite == 'all' || inputs.suite == 'timm_models' || inputs.suite == 'torchbench'
        env:
          DISTUTILS_USE_SDK: '1'
        uses: ./.github/actions/install-dependency
        with:
          package: torchvision
          repository: pytorch/vision
          ref: ${{ env.TORCHVISION_COMMIT_ID }}
          extra-cache-key: ${{ env.PYTORCH_VERSION }}
          workspace: /c/gh${{ github.run_id }}
      - name: Install torchaudio package
        if: inputs.suite == 'all' || inputs.suite == 'torchbench'
        uses: ./.github/actions/install-dependency
        with:
          package: torchaudio
          repository: pytorch/audio
          ref: ${{ env.TORCHAUDIO_COMMIT_ID }}
          extra-cache-key: ${{ env.PYTORCH_VERSION }}
          workspace: /c/gh${{ github.run_id }}
      - name: Install timm package
        if: inputs.suite == 'all' || inputs.suite == 'timm_models' || inputs.suite == 'torchbench'
        uses: ./.github/actions/install-dependency
        with:
          package: timm
          repository: huggingface/pytorch-image-models
          ref: ${{ env.TIMM_COMMIT_ID }}
          extra-cache-key: ${{ env.PYTORCH_VERSION }}
          workspace: /c/gh${{ github.run_id }}
      - name: Clone pytorch benchmark
        if: inputs.suite == 'all' || inputs.suite == 'torchbench'
        uses: actions/checkout@v4
        with:
          repository: ${{ env.BENCHMARK_REPO }}
          ref: ${{ env.BENCHMARK_COMMIT_ID }}
          submodules: recursive
          path: benchmark
      - name: Install pytorch benchmark
        if: inputs.suite == 'all' || inputs.suite == 'torchbench'
        run: |
          .venv\Scripts\activate.ps1
          Invoke-BatchFile "C:\Program Files (x86)\Intel\oneAPI\setvars.bat"
          bash -c '
            cd benchmark
            if [[ "${{ inputs.only_one_model }}" ]]; then
              python install.py "${{ inputs.only_one_model }}"
            else
              # install all models
              python install.py
            fi
            pip install -e .
          '
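      # Expand "all" into the full suite x mode x dtype cross-product and run
      # inductor_xpu_test.sh once per combination.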
      - name: Run e2e tests
        env:
          HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
        run: |
          .venv\Scripts\activate.ps1
          Invoke-BatchFile "C:\Program Files (x86)\Intel\oneAPI\setvars.bat"
          bash -c '
            cd /c/pytorch
            export WORKSPACE="/c/gh${{ github.run_id }}"
            if [[ "${{ inputs.TORCH_COMPILE_DEBUG }}" = "1" ]]; then
              export TORCH_COMPILE_DEBUG="1"
              # torch will save debug logs to $TORCH_COMPILE_DEBUG_DIR/torch_compile_debug
              export TORCH_COMPILE_DEBUG_DIR=$WORKSPACE
            fi
            if [[ "${{ inputs.suite }}" = "all" ]]; then
              suites=("huggingface" "timm_models" "torchbench")
            else
              suites=("${{ inputs.suite }}")
            fi
            if [[ "${{ inputs.mode }}" = "all" ]]; then
              modes=("inference" "inference-with-freezing" "training")
            else
              modes=("${{ inputs.mode }}")
            fi
            if [[ "${{ inputs.dtype }}" = "all" ]]; then
              dtypes=("amp_bf16" "amp_fp16" "bfloat16" "float16" "float32")
            else
              dtypes=("${{ inputs.dtype }}")
            fi
            # if "only_one_model" is set, then test this model
            # if "models" == "subset", then test the models from .github/models/{accuracy,performance}/{suite}.txt
            # otherwise test all models.
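            # The positional arguments to inductor_xpu_test.sh below appear to be:
            # suite, dtype, mode, test_mode, device, card id, shape mode, shard count,
            # shard id, and an optional model name (inferred from the calls here, not
            # from the script itself).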
            for suite in "${suites[@]}"; do
              for mode in "${modes[@]}"; do
                for dtype in "${dtypes[@]}"; do
                  if [[ "${{ inputs.only_one_model }}" ]]; then
                    bash -e $WORKSPACE/scripts/inductor_xpu_test.sh $suite $dtype $mode ${{ inputs.test_mode }} xpu 0 static 1 0 ${{ inputs.only_one_model }}
                  elif [[ "${{ inputs.models }}" == "subset" ]]; then
                    models_subset_file="$WORKSPACE/.github/models/${{ inputs.test_mode }}/$suite.txt"
                    while read -r model; do
                      bash -e $WORKSPACE/scripts/inductor_xpu_test.sh $suite $dtype $mode ${{ inputs.test_mode }} xpu 0 static 1 0 $model
                    done < "$models_subset_file"
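                    # check_inductor_report.py appears to scan the inductor logs for
                    # models that failed or were skipped, so errors still surface even
                    # though the loop above keeps going past individual failures.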
if [[ "${{ inputs.check_all_subset_models }}" == true ]]; then
python $WORKSPACE/scripts/check_inductor_report.py --models-file="$models_subset_file" \
--suite=$suite \
--dtype=$dtype \
--mode=$mode \
--test_mode=${{ inputs.test_mode }} \
--device=xpu \
--inductor-log-dir="$WORKSPACE/inductor_log"
fi
else
bash -e $WORKSPACE/scripts/inductor_xpu_test.sh $suite $dtype $mode ${{ inputs.test_mode }} xpu 0 static 1 0
fi
done
done
done
'
      - name: Identify GPU
        run: |
          # Initializing oneAPI to enable sycl-ls, which is used in capture-hw-details.sh on Windows.
          Invoke-BatchFile "C:\Program Files (x86)\Intel\oneAPI\setvars.bat"
          bash -c './scripts/capture-hw-details.sh | tee -a $GITHUB_ENV'
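      # capture-hw-details.sh is expected to export GPU_DEVICE, LIBIGC1_VERSION,
      # LEVEL_ZERO_VERSION and AGAMA_VERSION, which the report below picks up.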
      - name: Report environment details
        shell: bash
        run: |
          mkdir -p /c/gh${{ github.run_id }}/inductor_log
          cat <<EOF | tee /c/gh${{ github.run_id }}/inductor_log/e2e.env
          TIMESTAMP=$(date '+%Y%m%d%H%M%S')
          JOB_NAME=${{ join(inputs.*, '-') }}
          GITHUB_RUN_ID=$GITHUB_RUN_ID
          GITHUB_RUN_NUMBER=$GITHUB_RUN_NUMBER
          GITHUB_RUN_ATTEMPT=$GITHUB_RUN_ATTEMPT
          E2E_MODE=${{ inputs.mode }}
          E2E_TEST_MODE=${{ inputs.test_mode }}
          E2E_SUITE=${{ inputs.suite }}
          E2E_DTYPE=${{ inputs.dtype }}
          PYTHON_VERSION=$PYTHON_VERSION
          PYTORCH_REPO=pytorch/pytorch
          PYTORCH_COMMIT_ID=$(<.github/pins/pytorch.txt)
          PYTORCH_VERSION=$PYTORCH_VERSION
          BENCHMARK_REPO=$BENCHMARK_REPO
          BENCHMARK_COMMIT_ID=$BENCHMARK_COMMIT_ID
          TRITON_REPO=$GITHUB_REPOSITORY
          TRITON_COMMIT_ID=$GITHUB_SHA
          TORCHVISION_COMMIT_ID=$TORCHVISION_COMMIT_ID
          TORCHAUDIO_COMMIT_ID=$TORCHAUDIO_COMMIT_ID
          TIMM_COMMIT_ID=$TIMM_COMMIT_ID
          LIBIGC1_VERSION=$LIBIGC1_VERSION
          LEVEL_ZERO_VERSION=$LEVEL_ZERO_VERSION
          GPU_DEVICE=$GPU_DEVICE
          AGAMA_VERSION=$AGAMA_VERSION
          OS=${{ runner.os }}
          EOF
      - name: Upload test logs
        uses: actions/upload-artifact@v4
        with:
          name: logs
          path: ${{ env.NEW_WORKSPACE }}\inductor_log
          include-hidden-files: true
      - name: Clean up workspace
        if: ${{ always() }}
        run: |
          Remove-Item -LiteralPath ${{ env.NEW_WORKSPACE }} -Force -Recurse -ErrorAction Ignore
      - name: Clean up temporary files
        if: ${{ always() }}
        shell: bash
        run: |
          rm -rf /tmp/triton-* /tmp/tmp*