Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
a736c41
enable xpu ci test
DiweiSun Aug 19, 2025
7c96ad4
Revert "enable xpu ci test"
DiweiSun Aug 20, 2025
d1122fc
enable ci test for xpu
DiweiSun Aug 20, 2025
4593d95
Create ci_test_xpu.sh
DiweiSun Aug 20, 2025
7d90b8c
Update .github/workflows/pr-test-xpu.yml
DiweiSun Aug 22, 2025
e6bc407
Update .github/workflows/pr-test-xpu.yml
DiweiSun Aug 22, 2025
c34601f
fix for trigger scenarios
DiweiSun Sep 2, 2025
3085c2b
port from pytorch repo
DiweiSun Sep 2, 2025
d9ab09e
Rename action.yml to xpu-action.yml
DiweiSun Sep 2, 2025
f87892a
update to align with pytorch
DiweiSun Sep 2, 2025
c6f07b5
Revert "Rename action.yml to xpu-action.yml"
DiweiSun Sep 4, 2025
544593a
Revert "port from pytorch repo"
DiweiSun Sep 4, 2025
2e1dc50
Update .github/workflows/pr-test-xpu.yml
DiweiSun Sep 4, 2025
188a0f8
debug for runner
DiweiSun Sep 5, 2025
421d02c
lint format fix
DiweiSun Sep 8, 2025
6f6cd17
format fix
DiweiSun Sep 8, 2025
bae7000
format fix
DiweiSun Sep 8, 2025
7bd3d29
format fix
DiweiSun Sep 9, 2025
4fd2909
format fix
DiweiSun Sep 9, 2025
4a7d9af
format fix
DiweiSun Sep 9, 2025
c3f4384
format fix
DiweiSun Sep 9, 2025
e8936cb
format fix
DiweiSun Sep 10, 2025
50e56ec
trigger by tag only
DiweiSun Sep 15, 2025
5a46341
add xpu label for xpuci
DiweiSun Sep 16, 2025
030121f
fix docker path
DiweiSun Sep 17, 2025
965677d
debug for permission issue
DiweiSun Sep 17, 2025
419d824
debug for permission issue
DiweiSun Sep 17, 2025
ac3ebb9
debug for permission issue
DiweiSun Sep 17, 2025
7cd2475
format fix
DiweiSun Sep 18, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .github/pytorch-probot.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,4 @@ ciflow_push_tags:
- ciflow/tutorials
- ciflow/rocm
- ciflow/4xh100
- ciflow/xpu
9 changes: 9 additions & 0 deletions .github/scripts/ci_test_xpu.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
#!/bin/bash
#
# CI entry point for the torchao XPU tests.
#
# Steps:
#   1. Install the nightly XPU wheels of torch/torchvision/torchaudio and
#      pytorch-triton-xpu.
#   2. Install the current checkout with setup.py.
#   3. Install the Python test dependencies.
#   4. Run every test file in test/quantization with pytest.
#
# Must be started from the repository root (it uses the relative paths
# setup.py and test/quantization).

# Abort on the first failing command, on use of an unset variable, and on a
# failure anywhere in a pipeline. Without this, a failed install or a failed
# `cd` would be ignored and pytest would still run (possibly from the wrong
# directory), hiding the real cause of the failure.
set -euo pipefail

# --force-reinstall/--no-cache-dir make sure the nightly wheels replace any
# torch build that is already present in the environment.
python3 -m pip install torch torchvision torchaudio pytorch-triton-xpu --index-url https://download.pytorch.org/whl/nightly/xpu --force-reinstall --no-cache-dir
python3 setup.py install

# Use `python3 -m pip` here as well so the test dependencies land in the same
# interpreter that received torch above.
python3 -m pip install pytest expecttest parameterized accelerate hf_transfer 'modelscope!=1.15.0'

cd test/quantization
# The script's exit status is pytest's exit status.
pytest -v -s *.py
240 changes: 240 additions & 0 deletions .github/workflows/pr-test-xpu.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,240 @@
# TODO: this looks sort of similar to _linux-test, but there are like a dozen
# places where you would have to insert an if statement. Probably it's better to
# just use a different workflow altogether

name: xpu-test

# Runs only when a tag matching ciflow/xpu/* is pushed; there is no
# pull_request or branch trigger.
on:
  push:
    tags:
      - ciflow/xpu/*

permissions:
  # id-token: write is required by the "configure aws credentials" step,
  # which assumes an IAM role.
  id-token: write
  contents: read

concurrency:
  # One running workflow per ref: a newer run for the same ref cancels the
  # older one. The refs/heads/main branch of the expression only matters if a
  # branch trigger is added; with the tag-only trigger above the key is
  # derived from github.ref.
  group: xpu_ci_test-${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }}
  cancel-in-progress: true

jobs:
  # Single job: verify the XPU runner is healthy, pull the PyTorch XPU CI
  # image, start a detached container with the GPU devices mapped in, and run
  # .github/scripts/ci_test_xpu.sh inside it.
  test:
    # Don't run on forked repos or empty test matrix
    # if: github.repository_owner == 'pytorch' && toJSON(fromJSON(inputs.test-matrix).include) != '[]'
    timeout-minutes: 60
    runs-on: linux.idc.xpu
    env:
      # Short image name; it is resolved to a full image reference by the
      # "Calculate docker image" step below.
      DOCKER_IMAGE: ci-image:pytorch-linux-jammy-xpu-n-py3
      PYTORCH_RETRY_TEST_CASES: 1
      PYTORCH_OVERRIDE_FLAKY_SIGNAL: 1
      XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla
    steps:
      # [see note: pytorch repo ref]
      - name: Checkout PyTorch
        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main

      - name: Checkout Torchao
        uses: actions/checkout@v4

      - name: Clean all stopped docker containers
        if: always()
        shell: bash
        run: |
          # Prune all stopped containers.
          # If other runner is pruning on this node, will skip.
          # (A count of 1 means the only match is this grep itself.)
          nprune=$(ps -ef | grep -c "docker container prune")
          if [[ $nprune -eq 1 ]]; then
            docker container prune -f
          fi

      - name: Runner health check system info
        if: always()
        shell: bash
        run: |
          cat /etc/os-release || true
          cat /etc/apt/sources.list.d/oneAPI.list || true
          cat /etc/apt/sources.list.d/intel-gpu-jammy.list || true
          whoami

      - name: Runner health check xpu-smi
        if: always()
        shell: bash
        run: |
          timeout 30 xpu-smi discovery || true

      # Host-side GPU check based on xpu-smi; fails the job when no device is
      # reported.
      - name: Runner health check GPU count
        if: always()
        shell: bash
        run: |
          ngpu=$(timeout 30 xpu-smi discovery | grep -c -E 'Device Name' || true)
          msg="Please file an issue on pytorch/pytorch reporting the faulty runner. Include a link to the runner logs so the runner can be identified"
          if [[ $ngpu -eq 0 ]]; then
            echo "Error: Failed to detect any GPUs on the runner"
            echo "$msg"
            exit 1
          fi

      - name: Runner diskspace health check
        uses: pytorch/pytorch/.github/actions/diskspace-cleanup@main
        if: always()

      # Only runs when one of the health checks above failed: kill the runner
      # service process on this node.
      - name: Runner health check disconnect on failure
        if: ${{ failure() }}
        shell: bash
        run: |
          killall runsvc.sh

      - name: Preserve github env variables for use in docker
        shell: bash
        run: |
          env | grep '^GITHUB' >> "/tmp/github_env_${GITHUB_RUN_ID}"
          env | grep '^CI' >> "/tmp/github_env_${GITHUB_RUN_ID}"

      - name: XPU set GPU_FLAG
        shell: bash
        run: |
          # Add render group for container creation.
          render_gid=$(cat /etc/group | grep render | cut -d: -f3)
          echo "GPU_FLAG=--device=/dev/mem --device=/dev/dri --group-add video --group-add $render_gid" >> "${GITHUB_ENV}"

      - name: configure aws credentials
        id: aws_creds
        uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0
        with:
          role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
          aws-region: us-east-1

      - name: Login to Amazon ECR
        id: login-ecr
        uses: aws-actions/amazon-ecr-login@062b18b96a7aff071d4dc91bc00c4c1a7945b076 # v2.0.1

      # Debug aid: prints the workspace layout so a missing ../pytorch checkout
      # is visible before "Calculate docker image" uses it.
      - name: Debug Check workspace structure
        run: |
          echo "🔍 Current workspace: $GITHUB_WORKSPACE"
          echo "📁 Current directory: $(pwd)"
          echo "📄 Listing parent directory:"
          ls -la ..
          echo "📄 Listing ../pytorch:"
          ls -la ../pytorch || echo "❌ ../pytorch does not exist"
          echo "📄 Checking for .ci/docker in PyTorch:"
          if [ -d "../pytorch/.ci/docker" ]; then
            echo "✅ Found ../pytorch/.ci/docker"
            ls -la ../pytorch/.ci/docker
          else
            echo "❌ ../pytorch/.ci/docker does not exist"
          fi

      - name: Calculate docker image
        id: calculate-docker-image
        uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
        with:
          docker-image-name: ${{ env.DOCKER_IMAGE }}
          working-directory: ../pytorch/.ci/docker

      - name: Use following to pull public copy of the image
        id: print-ghcr-mirror
        env:
          ECR_DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }}
        shell: bash
        run: |
          tag=${ECR_DOCKER_IMAGE##*:}
          echo "docker pull ghcr.io/pytorch/ci-image:${tag/:/-}"

      - name: Pull docker image
        uses: pytorch/test-infra/.github/actions/pull-docker-image@main
        with:
          docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}

      # Second GPU check, this time based on the OpenCL device list reported
      # by clinfo.
      - name: Runner health check GPU count
        if: always()
        shell: bash
        run: |
          ngpu=$(timeout 30 clinfo -l | grep -c -E 'Device' || true)
          msg="Please file an issue on pytorch/ao reporting the faulty runner. Include a link to the runner logs so the runner can be identified"
          if [[ $ngpu -eq 0 ]]; then
            echo "Error: Failed to detect any GPUs on the runner"
            echo "$msg"
            exit 1
          fi

      - name: Test
        id: test
        env:
          TEST_COMMAND: .github/scripts/ci_test_xpu.sh
          # Run the exact image that "Pull docker image" fetched, not the
          # short name from the job-level env.
          DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }}
          PR_NUMBER: ${{ github.event.pull_request.number }}
          GITHUB_REPOSITORY: ${{ github.repository }}
          GITHUB_WORKFLOW: ${{ github.workflow }}
          GITHUB_JOB: ${{ github.job }}
          GITHUB_RUN_ID: ${{ github.run_id }}
          GITHUB_RUN_NUMBER: ${{ github.run_number }}
          GITHUB_RUN_ATTEMPT: ${{ github.run_attempt }}
          SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
        timeout-minutes: 60
        run: |
          set -x

          # The detached container is stopped by the "Stop container before
          # exit" step further down.
          # GPU_FLAG is intentionally unquoted so it splits into several
          # docker arguments.
          # No --name is passed: docker assigns one, and the container ID
          # printed by `docker run --detach` is what later steps use.
          # shellcheck disable=SC2086,SC2090
          container_name=$(docker run \
            ${GPU_FLAG:-} \
            -e PR_NUMBER \
            -e GITHUB_ACTIONS \
            -e GITHUB_REPOSITORY \
            -e GITHUB_WORKFLOW \
            -e GITHUB_JOB \
            -e GITHUB_RUN_ID \
            -e GITHUB_RUN_NUMBER \
            -e GITHUB_RUN_ATTEMPT \
            -e JOB_ID \
            -e BRANCH \
            -e SHA1 \
            --ulimit stack=10485760:83886080 \
            --ulimit core=0 \
            --security-opt seccomp=unconfined \
            --cap-add=SYS_PTRACE \
            --shm-size="8g" \
            --tty \
            --detach \
            --user jenkins \
            --privileged \
            -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \
            -w /var/lib/jenkins/workspace \
            "${DOCKER_IMAGE}"
          )
          # save container name for later step
          echo "CONTAINER_NAME=${container_name}" >> "$GITHUB_ENV"
          # TEST_COMMAND comes from this step's env block; it is a path
          # relative to the mounted workspace, which is the container's
          # working directory.
          docker exec -t "${container_name}" sh -c "bash ${TEST_COMMAND}"

      - name: Change permissions
        if: ${{ always() && steps.test.conclusion }}
        run: |
          docker exec -t "${{ env.CONTAINER_NAME }}" sh -c "sudo chown -R jenkins:jenkins test"

      - name: Collect backtraces from coredumps (if any)
        if: always()
        run: |
          # shellcheck disable=SC2156
          find . -iname "core.[1-9]*" -exec docker exec "${CONTAINER_NAME}" sh -c "gdb python {} -ex 'bt' -ex 'q'" \;

      - name: Stop container before exit
        if: always()
        run: |
          # Workaround for multiple runners on same IDC node
          # Skip when the Test step never got as far as creating a container.
          if [[ -n "${CONTAINER_NAME:-}" ]]; then
            docker stop "${CONTAINER_NAME}"
          fi

      - name: Store Core dumps on GitHub
        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
        if: failure()
        with:
          # This job has no matrix, so key the artifact on the run instead of
          # matrix.* values (which would all expand to empty strings).
          name: coredumps-${{ github.run_id }}-${{ github.run_attempt }}
          retention-days: 14
          if-no-files-found: ignore
          path: ./**/core.[1-9]*

      - name: Teardown XPU
        uses: pytorch/pytorch/.github/actions/teardown-xpu@main
Loading