Skip to content

Commit b4afc62

Browse files
authored
[ROCm] Python 3.10 in ROCm CI, and ROCm 6.2.3 in MigraphX CI (#22527)
### Description Upgrade Python from 3.9 to 3.10 in the ROCm and MIGraphX docker files and CI pipelines. Upgrade the ROCm version to 6.2.3 in most places except the ROCm CI pipeline (see the comment below). Some improvements/upgrades to the ROCm/MIGraphX docker files or pipelines: * ROCm 6.0/6.1.3 => 6.2.3 * Python 3.9 => 3.10 * Ubuntu 20.04 => 22.04 * Also upgrade the ml_dtypes, numpy and scipy packages. * Fix the message "ROCm version from ..." so that it prints the correct file path in CMakeLists.txt * Exclude some NHWC tests since the ROCm EP lacks support for NHWC convolution. #### ROCm CI Pipeline: ROCm 6.1.3 is kept in the pipeline for now. - The pipeline failed after upgrading to ROCm 6.2.3: `HIPBLAS_STATUS_INVALID_VALUE ; GPU=0 ; hostname=76123b390aed ; file=/onnxruntime_src/onnxruntime/core/providers/rocm/rocm_execution_provider.cc ; line=170 ; expr=hipblasSetStream(hipblas_handle_, stream);`. It needs further investigation. - cupy issues: (1) It currently supports numpy < 1.27 and might not work with numpy 2.x, so we locked numpy==1.26.4 for now. (2) cupy support for ROCm 6.2 is still in progress: cupy/cupy#8606. Note on miniconda: its libstdc++.so.6 and libgcc_s.so.1 might conflict with the system ones, so we created links to use the system ones. #### MigraphX CI pipeline The MIGraphX CI does not use cupy, so we are able to use ROCm 6.2.3 and numpy 2.x in the pipeline. #### Other attempts Other things that I've tried which might help in the future: Attempt to use a single docker file for both ROCm and MIGraphX: #22478 Upgrade to Ubuntu 24.04 and Python 3.12, and use venv like [this](https://github.com/microsoft/onnxruntime/blob/27903e7ff1dd7256cd2b277c03766b4f2ad9e2f1/tools/ci_build/github/linux/docker/rocm-ci-pipeline-env.Dockerfile). ### Motivation and Context In the 1.20 release, the ROCm nuget packaging pipeline will use 6.2: #22461. This upgrades ROCm to 6.2.3 in the CI pipelines to be consistent.
1 parent 28efacf commit b4afc62

File tree

11 files changed

+70
-61
lines changed

11 files changed

+70
-61
lines changed

cmake/CMakeLists.txt

Lines changed: 39 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -291,12 +291,50 @@ if (onnxruntime_USE_ROCM)
291291
message(FATAL_ERROR "ROCM does not support build with CUDA!")
292292
endif()
293293

294+
# replicate strategy used by pytorch to get ROCM_VERSION
295+
# https://github.com/pytorch/pytorch/blob/5c5b71b6eebae76d744261715231093e62f0d090/cmake/public/LoadHIP.cmake
296+
# with modification
297+
if (EXISTS "${onnxruntime_ROCM_HOME}/.info/version")
298+
message("\n***** ROCm version from ${onnxruntime_ROCM_HOME}/.info/version ****\n")
299+
file(READ "${onnxruntime_ROCM_HOME}/.info/version" ROCM_VERSION_DEV_RAW)
300+
string(REGEX MATCH "^([0-9]+)\.([0-9]+)\.([0-9]+)-.*$" ROCM_VERSION_MATCH ${ROCM_VERSION_DEV_RAW})
301+
elseif (EXISTS "${onnxruntime_ROCM_HOME}/include/rocm_version.h")
302+
message("\n***** ROCm version from ${onnxruntime_ROCM_HOME}/include/rocm_version.h ****\n")
303+
file(READ "${onnxruntime_ROCM_HOME}/include/rocm_version.h" ROCM_VERSION_H_RAW)
304+
string(REGEX MATCH "\"([0-9]+)\.([0-9]+)\.([0-9]+).*\"" ROCM_VERSION_MATCH ${ROCM_VERSION_H_RAW})
305+
elseif (EXISTS "${onnxruntime_ROCM_HOME}/include/rocm-core/rocm_version.h")
306+
message("\n***** ROCm version from ${onnxruntime_ROCM_HOME}/include/rocm-core/rocm_version.h ****\n")
307+
file(READ "${onnxruntime_ROCM_HOME}/include/rocm-core/rocm_version.h" ROCM_VERSION_H_RAW)
308+
string(REGEX MATCH "\"([0-9]+)\.([0-9]+)\.([0-9]+).*\"" ROCM_VERSION_MATCH ${ROCM_VERSION_H_RAW})
309+
endif()
310+
311+
if (ROCM_VERSION_MATCH)
312+
set(ROCM_VERSION_DEV_MAJOR ${CMAKE_MATCH_1})
313+
set(ROCM_VERSION_DEV_MINOR ${CMAKE_MATCH_2})
314+
set(ROCM_VERSION_DEV_PATCH ${CMAKE_MATCH_3})
315+
set(ROCM_VERSION_DEV "${ROCM_VERSION_DEV_MAJOR}.${ROCM_VERSION_DEV_MINOR}.${ROCM_VERSION_DEV_PATCH}")
316+
math(EXPR ROCM_VERSION_DEV_INT "(${ROCM_VERSION_DEV_MAJOR}*10000) + (${ROCM_VERSION_DEV_MINOR}*100) + ${ROCM_VERSION_DEV_PATCH}")
317+
318+
message("ROCM_VERSION_DEV: ${ROCM_VERSION_DEV}")
319+
message("ROCM_VERSION_DEV_MAJOR: ${ROCM_VERSION_DEV_MAJOR}")
320+
message("ROCM_VERSION_DEV_MINOR: ${ROCM_VERSION_DEV_MINOR}")
321+
message("ROCM_VERSION_DEV_PATCH: ${ROCM_VERSION_DEV_PATCH}")
322+
message("ROCM_VERSION_DEV_INT: ${ROCM_VERSION_DEV_INT}")
323+
else()
324+
message(FATAL_ERROR "Cannot determine ROCm version string")
325+
endif()
326+
327+
294328
if (NOT CMAKE_HIP_COMPILER)
295329
set(CMAKE_HIP_COMPILER "${onnxruntime_ROCM_HOME}/llvm/bin/clang++")
296330
endif()
297331

298332
if (NOT CMAKE_HIP_ARCHITECTURES)
299-
set(CMAKE_HIP_ARCHITECTURES "gfx908;gfx90a;gfx1030;gfx1100;gfx1101;gfx940;gfx941;gfx942;gfx1200;gfx1201")
333+
if (ROCM_VERSION_DEV VERSION_LESS "6.2")
334+
message(FATAL_ERROR "CMAKE_HIP_ARCHITECTURES is not set when ROCm version < 6.2")
335+
else()
336+
set(CMAKE_HIP_ARCHITECTURES "gfx908;gfx90a;gfx1030;gfx1100;gfx1101;gfx940;gfx941;gfx942;gfx1200;gfx1201")
337+
endif()
300338
endif()
301339

302340
file(GLOB rocm_cmake_components ${onnxruntime_ROCM_HOME}/lib/cmake/*)
@@ -328,35 +366,6 @@ if (onnxruntime_USE_ROCM)
328366
set(onnxruntime_HIPIFY_PERL ${HIPIFY_PERL_PATH}/hipify-perl)
329367
endif()
330368

331-
# replicate strategy used by pytorch to get ROCM_VERSION
332-
# https://github.com/pytorch/pytorch/blob/5c5b71b6eebae76d744261715231093e62f0d090/cmake/public/LoadHIP.cmake
333-
# with modification
334-
if (EXISTS "${onnxruntime_ROCM_HOME}/.info/version")
335-
file(READ "${onnxruntime_ROCM_HOME}/.info/version" ROCM_VERSION_DEV_RAW)
336-
string(REGEX MATCH "^([0-9]+)\.([0-9]+)\.([0-9]+)-.*$" ROCM_VERSION_MATCH ${ROCM_VERSION_DEV_RAW})
337-
elseif (EXISTS "${onnxruntime_ROCM_HOME}/include/rocm_version.h")
338-
file(READ "${onnxruntime_ROCM_HOME}/include/rocm_version.h" ROCM_VERSION_H_RAW)
339-
string(REGEX MATCH "\"([0-9]+)\.([0-9]+)\.([0-9]+).*\"" ROCM_VERSION_MATCH ${ROCM_VERSION_H_RAW})
340-
elseif (EXISTS "${onnxruntime_ROCM_HOME}/include/rocm-core/rocm_version.h")
341-
file(READ "${onnxruntime_ROCM_HOME}/include/rocm-core/rocm_version.h" ROCM_VERSION_H_RAW)
342-
string(REGEX MATCH "\"([0-9]+)\.([0-9]+)\.([0-9]+).*\"" ROCM_VERSION_MATCH ${ROCM_VERSION_H_RAW})
343-
endif()
344-
345-
if (ROCM_VERSION_MATCH)
346-
set(ROCM_VERSION_DEV_MAJOR ${CMAKE_MATCH_1})
347-
set(ROCM_VERSION_DEV_MINOR ${CMAKE_MATCH_2})
348-
set(ROCM_VERSION_DEV_PATCH ${CMAKE_MATCH_3})
349-
set(ROCM_VERSION_DEV "${ROCM_VERSION_DEV_MAJOR}.${ROCM_VERSION_DEV_MINOR}.${ROCM_VERSION_DEV_PATCH}")
350-
math(EXPR ROCM_VERSION_DEV_INT "(${ROCM_VERSION_DEV_MAJOR}*10000) + (${ROCM_VERSION_DEV_MINOR}*100) + ${ROCM_VERSION_DEV_PATCH}")
351-
else()
352-
message(FATAL_ERROR "Cannot determine ROCm version string")
353-
endif()
354-
message("\n***** ROCm version from ${onnxruntime_ROCM_HOME}/.info/version ****\n")
355-
message("ROCM_VERSION_DEV: ${ROCM_VERSION_DEV}")
356-
message("ROCM_VERSION_DEV_MAJOR: ${ROCM_VERSION_DEV_MAJOR}")
357-
message("ROCM_VERSION_DEV_MINOR: ${ROCM_VERSION_DEV_MINOR}")
358-
message("ROCM_VERSION_DEV_PATCH: ${ROCM_VERSION_DEV_PATCH}")
359-
message("ROCM_VERSION_DEV_INT: ${ROCM_VERSION_DEV_INT}")
360369
message("\n***** HIP LANGUAGE CONFIG INFO ****\n")
361370
message("CMAKE_HIP_COMPILER: ${CMAKE_HIP_COMPILER}")
362371
message("CMAKE_HIP_ARCHITECTURES: ${CMAKE_HIP_ARCHITECTURES}")

dockerfiles/Dockerfile.migraphx

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
# Dockerfile to run ONNXRuntime with MIGraphX integration
66
#--------------------------------------------------------------------------
77

8-
FROM rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1
8+
FROM rocm/pytorch:rocm6.2.3_ubuntu22.04_py3.10_pytorch_release_2.3.0
99

1010
ARG ONNXRUNTIME_REPO=https://github.com/Microsoft/onnxruntime
1111
ARG ONNXRUNTIME_BRANCH=main

dockerfiles/Dockerfile.rocm

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
# Dockerfile to run ONNXRuntime with ROCm integration
66
#--------------------------------------------------------------------------
77

8-
FROM rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1
8+
FROM rocm/pytorch:rocm6.2.3_ubuntu22.04_py3.10_pytorch_release_2.3.0
99

1010
ARG ONNXRUNTIME_REPO=https://github.com/Microsoft/onnxruntime
1111
ARG ONNXRUNTIME_BRANCH=main

dockerfiles/README.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -292,7 +292,7 @@ Nothing else from ONNX Runtime source tree will be copied/installed to the image
292292
Note: When running the container you built in Docker, please either use 'nvidia-docker' command instead of 'docker', or use Docker command-line options to make sure NVIDIA runtime will be used and appropriate files mounted from host. Otherwise, CUDA libraries won't be found. You can also [set NVIDIA runtime as default in Docker](https://github.com/dusty-nv/jetson-containers#docker-default-runtime).
293293

294294
## MIGraphX
295-
**Ubuntu 20.04, ROCm6.0, MIGraphX**
295+
**Ubuntu 22.04, ROCm6.2.3, MIGraphX**
296296

297297
1. Build the docker image from the Dockerfile in this repository.
298298
```
@@ -306,7 +306,7 @@ Note: When running the container you built in Docker, please either use 'nvidia-
306306
```
307307

308308
## ROCm
309-
**Ubuntu 20.04, ROCm6.0**
309+
**Ubuntu 22.04, ROCm6.2.3**
310310

311311
1. Build the docker image from the Dockerfile in this repository.
312312
```

onnxruntime/test/providers/internal_testing/internal_testing_tests.cc

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -159,7 +159,7 @@ TEST(InternalTestingEP, PreventSaveOfModelWithCompiledOps) {
159159

160160
// the internal NHWC operators are only included as part of contrib ops currently. as the EP requests the NHWC
161161
// version of the ONNX operator when matching a static kernel, those are required.
162-
#if !defined(DISABLE_CONTRIB_OPS)
162+
#if !defined(DISABLE_CONTRIB_OPS) && !defined(USE_ROCM)
163163
TEST(InternalTestingEP, TestMixOfStaticAndCompiledKernels) {
164164
const ORTCHAR_T* ort_model_path = ORT_MODEL_FOLDER "transform/fusion/conv_relu_opset12.onnx";
165165

@@ -256,10 +256,6 @@ TEST(InternalTestingEP, TestNhwcConversionOfStaticKernels) {
256256
run_test(ort_model_path);
257257
}
258258

259-
// This test can be deprecated now as the code logic has been changed so the model is not applicable
260-
// TEST(InternalTestingEP, TestRegisterAllocatorHandlesUsageInMultipleSessions) {
261-
//}
262-
263259
// make sure allocators returned by SessionState::GetAllocator are valid when IExecutionProvider::ReplaceAllocator
264260
// is used. if something is off InferenceSession::Initialize will fail.
265261
TEST(InternalTestingEP, TestReplaceAllocatorDoesntBreakDueToLocalAllocatorStorage) {

tools/ci_build/github/azure-pipelines/linux-migraphx-ci-pipeline.yml

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -37,9 +37,7 @@ variables:
3737
- name: render
3838
value: 109
3939
- name: RocmVersion
40-
value: 6.1
41-
- name: RocmVersionPatchSuffix
42-
value: ".3"
40+
value: 6.2.3
4341

4442
jobs:
4543
- job: Linux_Build
@@ -66,7 +64,7 @@ jobs:
6664
parameters:
6765
Dockerfile: tools/ci_build/github/linux/docker/migraphx-ci-pipeline-env.Dockerfile
6866
Context: tools/ci_build/github/linux/docker
69-
DockerBuildArgs: "--build-arg ROCM_VERSION=$(RocmVersion)$(RocmVersionPatchSuffix)"
67+
DockerBuildArgs: "--build-arg ROCM_VERSION=$(RocmVersion)"
7068
Repository: onnxruntimetrainingmigraphx-cibuild-rocm$(RocmVersion)
7169

7270
- task: Cache@2
@@ -165,7 +163,7 @@ jobs:
165163
parameters:
166164
Dockerfile: tools/ci_build/github/linux/docker/migraphx-ci-pipeline-env.Dockerfile
167165
Context: tools/ci_build/github/linux/docker
168-
DockerBuildArgs: "--build-arg ROCM_VERSION=$(RocmVersion)$(RocmVersionPatchSuffix)"
166+
DockerBuildArgs: "--build-arg ROCM_VERSION=$(RocmVersion)"
169167
Repository: onnxruntimetrainingmigraphx-cibuild-rocm$(RocmVersion)
170168

171169
- task: CmdLine@2

tools/ci_build/github/azure-pipelines/linux-rocm-ci-pipeline.yml

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -37,9 +37,7 @@ variables:
3737
- name: render
3838
value: 109
3939
- name: RocmVersion
40-
value: 6.1
41-
- name: RocmVersionPatchSuffix
42-
value: ".3"
40+
value: 6.1.3
4341

4442
jobs:
4543
- job: Linux_Build
@@ -66,7 +64,7 @@ jobs:
6664
parameters:
6765
Dockerfile: tools/ci_build/github/linux/docker/rocm-ci-pipeline-env.Dockerfile
6866
Context: tools/ci_build/github/linux/docker
69-
DockerBuildArgs: "--build-arg ROCM_VERSION=$(RocmVersion)$(RocmVersionPatchSuffix)"
67+
DockerBuildArgs: "--build-arg ROCM_VERSION=$(RocmVersion)"
7068
Repository: onnxruntimerocm-cibuild-rocm$(RocmVersion)
7169

7270
- task: Cache@2
@@ -166,7 +164,7 @@ jobs:
166164
parameters:
167165
Dockerfile: tools/ci_build/github/linux/docker/rocm-ci-pipeline-env.Dockerfile
168166
Context: tools/ci_build/github/linux/docker
169-
DockerBuildArgs: "--build-arg ROCM_VERSION=$(RocmVersion)$(RocmVersionPatchSuffix)"
167+
DockerBuildArgs: "--build-arg ROCM_VERSION=$(RocmVersion)"
170168
Repository: onnxruntimerocm-cibuild-rocm$(RocmVersion)
171169

172170
- task: CmdLine@2
@@ -231,7 +229,11 @@ jobs:
231229
-e KERNEL_EXPLORER_TEST_USE_CUPY=1 \
232230
-e CUPY_CACHE_DIR=/build/Release \
233231
onnxruntimerocm-cibuild-rocm$(RocmVersion) \
234-
pytest /onnxruntime_src/onnxruntime/python/tools/kernel_explorer/ -n 4 --reruns 1 --durations=100
232+
/bin/bash -c "
233+
set -ex; \
234+
python --version; \
235+
ls /opt/miniconda/envs/rocm-ci/lib/; \
236+
pytest /onnxruntime_src/onnxruntime/python/tools/kernel_explorer/ -n 4 --reruns 1 --durations=100"
235237
workingDirectory: $(Build.SourcesDirectory)
236238
displayName: 'Run kernel explorer tests'
237239
condition: succeededOrFailed()

tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_rocm

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ ARG LD_LIBRARY_PATH_ARG=${DEVTOOLSET_ROOTPATH}/usr/lib64:${DEVTOOLSET_ROOTPATH}/
66
ARG PREPEND_PATH=${DEVTOOLSET_ROOTPATH}/usr/bin:
77

88
FROM $BASEIMAGE AS base_image
9-
ARG ROCM_VERSION=5.5
9+
ARG ROCM_VERSION=6.2.3
1010

1111
#Add our own dependencies
1212
ADD scripts /tmp/scripts

tools/ci_build/github/linux/docker/migraphx-ci-pipeline-env.Dockerfile

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
# Refer to https://github.com/RadeonOpenCompute/ROCm-docker/blob/master/dev/Dockerfile-ubuntu-22.04-complete
22
FROM ubuntu:22.04
33

4-
ARG ROCM_VERSION=6.0
4+
ARG ROCM_VERSION=6.2.3
55
ARG AMDGPU_VERSION=${ROCM_VERSION}
66
ARG APT_PREF='Package: *\nPin: release o=repo.radeon.com\nPin-Priority: 600'
77

@@ -68,7 +68,7 @@ RUN wget --quiet https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86
6868
# Create migraphx-ci environment
6969
ENV CONDA_ENVIRONMENT_PATH /opt/miniconda/envs/migraphx-ci
7070
ENV CONDA_DEFAULT_ENV migraphx-ci
71-
RUN conda create -y -n ${CONDA_DEFAULT_ENV} python=3.9
71+
RUN conda create -y -n ${CONDA_DEFAULT_ENV} python=3.10
7272
ENV PATH ${CONDA_ENVIRONMENT_PATH}/bin:${PATH}
7373

7474
# Enable migraphx-ci environment
@@ -80,4 +80,4 @@ RUN ln -sf /usr/lib/x86_64-linux-gnu/libstdc++.so.6 ${CONDA_ENVIRONMENT_PATH}/bi
8080
# Install migraphx
8181
RUN apt update && apt install -y migraphx
8282

83-
RUN pip install numpy packaging ml_dtypes==0.3.0
83+
RUN pip install numpy packaging ml_dtypes==0.5.0

tools/ci_build/github/linux/docker/rocm-ci-pipeline-env.Dockerfile

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
# Refer to https://github.com/RadeonOpenCompute/ROCm-docker/blob/master/dev/Dockerfile-ubuntu-22.04-complete
22
FROM ubuntu:22.04
33

4-
ARG ROCM_VERSION=6.0
4+
ARG ROCM_VERSION=6.1.3
55
ARG AMDGPU_VERSION=${ROCM_VERSION}
66
ARG APT_PREF='Package: *\nPin: release o=repo.radeon.com\nPin-Priority: 600'
77

@@ -67,26 +67,30 @@ RUN wget --quiet https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86
6767
# Create rocm-ci environment
6868
ENV CONDA_ENVIRONMENT_PATH /opt/miniconda/envs/rocm-ci
6969
ENV CONDA_DEFAULT_ENV rocm-ci
70-
RUN conda create -y -n ${CONDA_DEFAULT_ENV} python=3.9
70+
RUN conda create -y -n ${CONDA_DEFAULT_ENV} python=3.10
7171
ENV PATH ${CONDA_ENVIRONMENT_PATH}/bin:${PATH}
7272

7373
# Enable rocm-ci environment
7474
SHELL ["conda", "run", "-n", "rocm-ci", "/bin/bash", "-c"]
7575

76-
# ln -sf is needed to make sure that version `GLIBCXX_3.4.30' is found
76+
# Some shared libraries in the conda environment conflict with the ones installed in the Ubuntu system.
77+
# For example, the GCC version in the conda environment is 12.x, while the one in Ubuntu 22.04 is 11.x.
78+
# ln -sf to make sure we always use libstdc++.so.6 and libgcc_s.so.1 in the system.
7779
RUN ln -sf /usr/lib/x86_64-linux-gnu/libstdc++.so.6 ${CONDA_ENVIRONMENT_PATH}/bin/../lib/libstdc++.so.6
80+
RUN ln -sf /usr/lib/x86_64-linux-gnu/libgcc_s.so.1 ${CONDA_ENVIRONMENT_PATH}/bin/../lib/libgcc_s.so.1
7881

7982
RUN pip install packaging \
80-
ml_dtypes==0.3.0 \
83+
ml_dtypes==0.5.0 \
8184
pytest==7.4.4 \
8285
pytest-xdist \
8386
pytest-rerunfailures \
84-
scipy==1.10.0 \
85-
numpy==1.24.1
87+
scipy==1.14.1 \
88+
numpy==1.26.4
8689

8790
RUN apt install -y git
8891

8992
# Install Cupy to decrease CPU utilization
93+
# Note that the pinned version of Cupy checked out below requires numpy < 1.27
9094
RUN git clone https://github.com/ROCm/cupy && cd cupy && \
9195
git checkout 432a8683351d681e00903640489cb2f4055d2e09 && \
9296
export CUPY_INSTALL_USE_HIP=1 && \

0 commit comments

Comments
 (0)