From ca71b1f9dd77a63ca9dafc3778c54300cc35d56e Mon Sep 17 00:00:00 2001 From: "Sidorov, Dmitry" Date: Mon, 20 Jan 2025 10:55:24 -0800 Subject: [PATCH 01/16] try Signed-off-by: Sidorov, Dmitry --- .../Scalar/FPBuiltinFnSelection.cpp | 30 +++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/llvm/lib/Transforms/Scalar/FPBuiltinFnSelection.cpp b/llvm/lib/Transforms/Scalar/FPBuiltinFnSelection.cpp index dd423b42ab0ec..658d2965aff1b 100644 --- a/llvm/lib/Transforms/Scalar/FPBuiltinFnSelection.cpp +++ b/llvm/lib/Transforms/Scalar/FPBuiltinFnSelection.cpp @@ -64,6 +64,30 @@ static bool replaceWithAltMathFunction(FPBuiltinIntrinsic &BuiltinCall, return true; } +static bool replaceWithNVPTXCalls(FPBuiltinIntrinsic &BuiltinCall) { + IRBuilder<> IRBuilder(&BuiltinCall); + SmallVector Args(BuiltinCall.args()); + Value *Replacement = nullptr; + switch (BuiltinCall.getIntrinsicID()) { + case Intrinsic::fpbuiltin_fdiv: + Replacement = IRBuilder.CreateFDiv(Args[0], Args[1]); + break; + case Intrinsic::fpbuiltin_sqrt: + Replacement = + IRBuilder.CreateIntrinsic(BuiltinCall.getType(), Intrinsic::sqrt, Args); + break; + default: + return false; + } + BuiltinCall.replaceAllUsesWith(Replacement); + cast(Replacement)->copyFastMathFlags(&BuiltinCall); + LLVM_DEBUG(dbgs() << DEBUG_TYPE << ": Replaced call to `" + << BuiltinCall.getCalledFunction()->getName() + << "` with equivalent IR. \n `"); + return true; + +} + static bool replaceWithLLVMIR(FPBuiltinIntrinsic &BuiltinCall) { // Replace the call to the fpbuiltin intrinsic with a call // to the corresponding function from the alternate math library. @@ -154,6 +178,12 @@ static bool selectFnForFPBuiltinCalls(const TargetLibraryInfo &TLI, } } + if (T.isNVPTX() && BuiltinCall.getRequiredAccuracy().value() == 3.0) { + bool ToReturn = replaceWithNVPTXCalls(BuiltinCall); + if (ToReturn) + return true; + } + /// Call TLI to select a function implementation to call StringRef ImplName = TLI.selectFPBuiltinImplementation(&BuiltinCall); if (ImplName.empty()) { From 58269113517c55e2cd09438c4013a86a761ea57d Mon Sep 17 00:00:00 2001 From: "Sidorov, Dmitry" Date: Tue, 21 Jan 2025 06:15:57 -0800 Subject: [PATCH 02/16] [SYCL][NVPTX] Set default fdiv and sqrt for llvm.fpbuiltin for 3.0 max-error We are lacking implementation for llvm.fpbuiltin intrinsics for NVPTX target. This patch adds type-and fast-math- dependent mapping for llvm.fpbuiltin.fdiv and llvm.fpbuiltin.sqrt with 3.0 max-error on nvvm intrinsics: fp32 scalar @llvm.fpbuiltin.fdiv -> @llvm.nvvm.div.approx.f fp32 scalar @llvm.fpbuiltin.fdiv fast -> @llvm.nvvm.div.approx.ftz.f fp32 scalar @llvm.fpbuiltin.sqrt -> @llvm.nvvm.sqrt.approx.f fp32 scalar @llvm.fpbuiltin.sqrt fast -> @llvm.nvvm.sqrt.approx.ftz.f vector or non-fp32 scalar llvm.fpbuiltin.fdiv -> fdiv vector or non-fp32 scalar llvm.fpbuiltin.sqrt -> llvm.sqrt Signed-off-by: Sidorov, Dmitry --- .../Scalar/FPBuiltinFnSelection.cpp | 73 +++++++++++------ ...p-builtin-intrinsics-nvvm-max-error-3.0.ll | 81 +++++++++++++++++++ 2 files changed, 128 insertions(+), 26 deletions(-) create mode 100644 llvm/test/CodeGen/NVPTX/fp-builtin-intrinsics-nvvm-max-error-3.0.ll diff --git a/llvm/lib/Transforms/Scalar/FPBuiltinFnSelection.cpp b/llvm/lib/Transforms/Scalar/FPBuiltinFnSelection.cpp index 658d2965aff1b..56755f1c775dd 100644 --- a/llvm/lib/Transforms/Scalar/FPBuiltinFnSelection.cpp +++ b/llvm/lib/Transforms/Scalar/FPBuiltinFnSelection.cpp @@ -18,6 +18,7 @@ #include "llvm/IR/IRBuilder.h" #include "llvm/IR/InstIterator.h" #include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/IntrinsicsNVPTX.h" #include "llvm/InitializePasses.h" #include "llvm/Support/FormatVariadic.h" @@ -64,30 +65,6 @@ static bool replaceWithAltMathFunction(FPBuiltinIntrinsic &BuiltinCall, return true; } -static bool replaceWithNVPTXCalls(FPBuiltinIntrinsic &BuiltinCall) { - IRBuilder<> IRBuilder(&BuiltinCall); - SmallVector Args(BuiltinCall.args()); - Value *Replacement = nullptr; - switch (BuiltinCall.getIntrinsicID()) { - case Intrinsic::fpbuiltin_fdiv: - Replacement = IRBuilder.CreateFDiv(Args[0], Args[1]); - break; - case Intrinsic::fpbuiltin_sqrt: - Replacement = - IRBuilder.CreateIntrinsic(BuiltinCall.getType(), Intrinsic::sqrt, Args); - break; - default: - return false; - } - BuiltinCall.replaceAllUsesWith(Replacement); - cast(Replacement)->copyFastMathFlags(&BuiltinCall); - LLVM_DEBUG(dbgs() << DEBUG_TYPE << ": Replaced call to `" - << BuiltinCall.getCalledFunction()->getName() - << "` with equivalent IR. \n `"); - return true; - -} - static bool replaceWithLLVMIR(FPBuiltinIntrinsic &BuiltinCall) { // Replace the call to the fpbuiltin intrinsic with a call // to the corresponding function from the alternate math library. @@ -130,6 +107,48 @@ static bool replaceWithLLVMIR(FPBuiltinIntrinsic &BuiltinCall) { return true; } +static bool replaceWithNVPTXCalls(FPBuiltinIntrinsic &BuiltinCall) { + IRBuilder<> IRBuilder(&BuiltinCall); + SmallVector Args(BuiltinCall.args()); + Value *Replacement = nullptr; + // To chose between ftz and non-ftz intrinsic. + FastMathFlags FMF = BuiltinCall.getFastMathFlags(); + auto *Type = BuiltinCall.getType(); + // For now only add lowering for fdiv and sqrt. Yet nvvm intrinsics have + // approximate variants for sin, cos, exp2 and log2. + // For vector fpbuiltins for NVPTX target we don't have nvvm intrinsics, use + // standart for LLVM math operations. Also nvvm fdiv and sqrt intrisics + // support only float type. + switch (BuiltinCall.getIntrinsicID()) { + case Intrinsic::fpbuiltin_fdiv: + if (Type->isVectorTy() || !Type->getScalarType()->isFloatTy()) + return replaceWithLLVMIR(BuiltinCall); + Replacement = + IRBuilder.CreateIntrinsic(Type, + FMF.isFast() + ? Intrinsic::nvvm_div_approx_ftz_f + : Intrinsic::nvvm_div_approx_f, Args); + break; + case Intrinsic::fpbuiltin_sqrt: + if (Type->isVectorTy() || !Type->getScalarType()->isFloatTy()) + return replaceWithLLVMIR(BuiltinCall); + Replacement = + IRBuilder.CreateIntrinsic(BuiltinCall.getType(), + FMF.isFast() + ? Intrinsic::nvvm_sqrt_approx_ftz_f + : Intrinsic::nvvm_sqrt_approx_f, Args); + break; + default: + return false; + } + BuiltinCall.replaceAllUsesWith(Replacement); + cast(Replacement)->copyFastMathFlags(&BuiltinCall); + LLVM_DEBUG(dbgs() << DEBUG_TYPE << ": Replaced call to `" + << BuiltinCall.getCalledFunction()->getName() + << "` with equivalent IR. \n `"); + return true; +} + static bool selectFnForFPBuiltinCalls(const TargetLibraryInfo &TLI, const TargetTransformInfo &TTI, FPBuiltinIntrinsic &BuiltinCall) { @@ -178,9 +197,11 @@ static bool selectFnForFPBuiltinCalls(const TargetLibraryInfo &TLI, } } + // We don't have implementation for CUDA approximate precision builtins. + // Lets map them on NVPTX intrinsics. If no appropriate intrinsics are known + // - skip to replaceWithAltMathFunction. if (T.isNVPTX() && BuiltinCall.getRequiredAccuracy().value() == 3.0) { - bool ToReturn = replaceWithNVPTXCalls(BuiltinCall); - if (ToReturn) + if (replaceWithNVPTXCalls(BuiltinCall)) return true; } diff --git a/llvm/test/CodeGen/NVPTX/fp-builtin-intrinsics-nvvm-max-error-3.0.ll b/llvm/test/CodeGen/NVPTX/fp-builtin-intrinsics-nvvm-max-error-3.0.ll new file mode 100644 index 0000000000000..346e3475b5d75 --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/fp-builtin-intrinsics-nvvm-max-error-3.0.ll @@ -0,0 +1,81 @@ +; RUN: opt -fpbuiltin-fn-selection -S < %s | FileCheck %s + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64" +target triple = "nvptx64-nvidia-cuda" + +; CHECK-LABEL: @test_fdiv +; CHECK: %{{.*}} = call float @llvm.nvvm.div.approx.f(float %{{.*}}, float %{{.*}}) +; CHECK: %{{.*}} = fdiv <2 x float> %{{.*}}, %{{.*}} +define void @test_fdiv(float %d1, <2 x float> %v2d1, + float %d2, <2 x float> %v2d2) { +entry: + %t0 = call float @llvm.fpbuiltin.fdiv.f32(float %d1, float %d2) #0 + %t1 = call <2 x float> @llvm.fpbuiltin.fdiv.v2f32(<2 x float> %v2d1, <2 x float> %v2d2) #0 + ret void +} + +; CHECK-LABEL: @test_fdiv_fast +; CHECK: %{{.*}} = call fast float @llvm.nvvm.div.approx.ftz.f(float %{{.*}}, float %{{.*}}) +; CHECK: %{{.*}} = fdiv fast <2 x float> %{{.*}}, %{{.*}} +define void @test_fdiv_fast(float %d1, <2 x float> %v2d1, + float %d2, <2 x float> %v2d2) { +entry: + %t0 = call fast float @llvm.fpbuiltin.fdiv.f32(float %d1, float %d2) #0 + %t1 = call fast <2 x float> @llvm.fpbuiltin.fdiv.v2f32(<2 x float> %v2d1, <2 x float> %v2d2) #0 + ret void +} + +declare float @llvm.fpbuiltin.fdiv.f32(float, float) +declare <2 x float> @llvm.fpbuiltin.fdiv.v2f32(<2 x float>, <2 x float>) + +; CHECK-LABEL: @test_fdiv_double +; CHECK: %{{.*}} = fdiv double %{{.*}}, %{{.*}} +; CHECK: %{{.*}} = fdiv <2 x double> %{{.*}}, %{{.*}} +define void @test_fdiv_double(double %d1, <2 x double> %v2d1, + double %d2, <2 x double> %v2d2) { +entry: + %t0 = call double @llvm.fpbuiltin.fdiv.f64(double %d1, double %d2) #0 + %t1 = call <2 x double> @llvm.fpbuiltin.fdiv.v2f64(<2 x double> %v2d1, <2 x double> %v2d2) #0 + ret void +} + +declare double @llvm.fpbuiltin.fdiv.f64(double, double) +declare <2 x double> @llvm.fpbuiltin.fdiv.v2f64(<2 x double>, <2 x double>) + +; CHECK-LABEL: @test_sqrt +; CHECK: %{{.*}} = call float @llvm.nvvm.sqrt.approx.f(float %{{.*}}) +; CHECK: %{{.*}} = call <2 x float> @llvm.sqrt.v2f32(<2 x float> %{{.*}}) +define void @test_sqrt(float %d, <2 x float> %v2d, <4 x float> %v4d) { +entry: + %t0 = call float @llvm.fpbuiltin.sqrt.f32(float %d) #0 + %t1 = call <2 x float> @llvm.fpbuiltin.sqrt.v2f32(<2 x float> %v2d) #0 + ret void +} + +; CHECK-LABEL: @test_sqrt_fast +; CHECK: %{{.*}} = call fast float @llvm.nvvm.sqrt.approx.ftz.f(float %{{.*}}) +; CHECK: %{{.*}} = call fast <2 x float> @llvm.sqrt.v2f32(<2 x float> %{{.*}}) +define void @test_sqrt_fast(float %d, <2 x float> %v2d, <4 x float> %v4d) { +entry: + %t0 = call fast float @llvm.fpbuiltin.sqrt.f32(float %d) #0 + %t1 = call fast <2 x float> @llvm.fpbuiltin.sqrt.v2f32(<2 x float> %v2d) #0 + ret void +} + +declare float @llvm.fpbuiltin.sqrt.f32(float) +declare <2 x float> @llvm.fpbuiltin.sqrt.v2f32(<2 x float>) + +; CHECK-LABEL: @test_sqrt_double +; CHECK: %{{.*}} = call double @llvm.sqrt.f64(double %{{.*}}) +; CHECK: %{{.*}} = call <2 x double> @llvm.sqrt.v2f64(<2 x double> %{{.*}}) +define void @test_sqrt_double(double %d, <2 x double> %v2d) { +entry: + %t0 = call double @llvm.fpbuiltin.sqrt.f64(double %d) #0 + %t1 = call <2 x double> @llvm.fpbuiltin.sqrt.v2f64(<2 x double> %v2d) #0 + ret void +} + +declare double @llvm.fpbuiltin.sqrt.f64(double) +declare <2 x double> @llvm.fpbuiltin.sqrt.v2f64(<2 x double>) + +attributes #0 = { "fpbuiltin-max-error"="3.0" } From ee6428759f1883ca8409c2ab9d5dd1bf363c5eb0 Mon Sep 17 00:00:00 2001 From: "Sidorov, Dmitry" Date: Wed, 22 Jan 2025 04:25:38 -0800 Subject: [PATCH 03/16] apply comments Signed-off-by: Sidorov, Dmitry --- .../Transforms/Scalar/FPBuiltinFnSelection.cpp | 17 ++++++----------- .../fp-builtin-intrinsics-nvvm-max-error-3.0.ll | 12 +----------- 2 files changed, 7 insertions(+), 22 deletions(-) diff --git a/llvm/lib/Transforms/Scalar/FPBuiltinFnSelection.cpp b/llvm/lib/Transforms/Scalar/FPBuiltinFnSelection.cpp index 56755f1c775dd..bc67518e27f80 100644 --- a/llvm/lib/Transforms/Scalar/FPBuiltinFnSelection.cpp +++ b/llvm/lib/Transforms/Scalar/FPBuiltinFnSelection.cpp @@ -107,12 +107,12 @@ static bool replaceWithLLVMIR(FPBuiltinIntrinsic &BuiltinCall) { return true; } -static bool replaceWithNVPTXCalls(FPBuiltinIntrinsic &BuiltinCall) { +// This function lowers llvm.fpbuiltin. intrinsic functions with 3.0 max-error +// attribute to the appropriate nvvm approximate intrinsics if it's possible. +static bool replaceWithApproxNVPTXCalls(FPBuiltinIntrinsic &BuiltinCall) { IRBuilder<> IRBuilder(&BuiltinCall); SmallVector Args(BuiltinCall.args()); Value *Replacement = nullptr; - // To chose between ftz and non-ftz intrinsic. - FastMathFlags FMF = BuiltinCall.getFastMathFlags(); auto *Type = BuiltinCall.getType(); // For now only add lowering for fdiv and sqrt. Yet nvvm intrinsics have // approximate variants for sin, cos, exp2 and log2. @@ -124,19 +124,14 @@ static bool replaceWithNVPTXCalls(FPBuiltinIntrinsic &BuiltinCall) { if (Type->isVectorTy() || !Type->getScalarType()->isFloatTy()) return replaceWithLLVMIR(BuiltinCall); Replacement = - IRBuilder.CreateIntrinsic(Type, - FMF.isFast() - ? Intrinsic::nvvm_div_approx_ftz_f - : Intrinsic::nvvm_div_approx_f, Args); + IRBuilder.CreateIntrinsic(Type, Intrinsic::nvvm_div_approx_f, Args); break; case Intrinsic::fpbuiltin_sqrt: if (Type->isVectorTy() || !Type->getScalarType()->isFloatTy()) return replaceWithLLVMIR(BuiltinCall); Replacement = IRBuilder.CreateIntrinsic(BuiltinCall.getType(), - FMF.isFast() - ? Intrinsic::nvvm_sqrt_approx_ftz_f - : Intrinsic::nvvm_sqrt_approx_f, Args); + Intrinsic::nvvm_sqrt_approx_f, Args); break; default: return false; @@ -201,7 +196,7 @@ static bool selectFnForFPBuiltinCalls(const TargetLibraryInfo &TLI, // Lets map them on NVPTX intrinsics. If no appropriate intrinsics are known // - skip to replaceWithAltMathFunction. if (T.isNVPTX() && BuiltinCall.getRequiredAccuracy().value() == 3.0) { - if (replaceWithNVPTXCalls(BuiltinCall)) + if (replaceWithApproxNVPTXCalls(BuiltinCall)) return true; } diff --git a/llvm/test/CodeGen/NVPTX/fp-builtin-intrinsics-nvvm-max-error-3.0.ll b/llvm/test/CodeGen/NVPTX/fp-builtin-intrinsics-nvvm-max-error-3.0.ll index 346e3475b5d75..0827c668b8609 100644 --- a/llvm/test/CodeGen/NVPTX/fp-builtin-intrinsics-nvvm-max-error-3.0.ll +++ b/llvm/test/CodeGen/NVPTX/fp-builtin-intrinsics-nvvm-max-error-3.0.ll @@ -15,7 +15,7 @@ entry: } ; CHECK-LABEL: @test_fdiv_fast -; CHECK: %{{.*}} = call fast float @llvm.nvvm.div.approx.ftz.f(float %{{.*}}, float %{{.*}}) +; CHECK: %{{.*}} = call fast float @llvm.nvvm.div.approx.f(float %{{.*}}, float %{{.*}}) ; CHECK: %{{.*}} = fdiv fast <2 x float> %{{.*}}, %{{.*}} define void @test_fdiv_fast(float %d1, <2 x float> %v2d1, float %d2, <2 x float> %v2d2) { @@ -52,16 +52,6 @@ entry: ret void } -; CHECK-LABEL: @test_sqrt_fast -; CHECK: %{{.*}} = call fast float @llvm.nvvm.sqrt.approx.ftz.f(float %{{.*}}) -; CHECK: %{{.*}} = call fast <2 x float> @llvm.sqrt.v2f32(<2 x float> %{{.*}}) -define void @test_sqrt_fast(float %d, <2 x float> %v2d, <4 x float> %v4d) { -entry: - %t0 = call fast float @llvm.fpbuiltin.sqrt.f32(float %d) #0 - %t1 = call fast <2 x float> @llvm.fpbuiltin.sqrt.v2f32(<2 x float> %v2d) #0 - ret void -} - declare float @llvm.fpbuiltin.sqrt.f32(float) declare <2 x float> @llvm.fpbuiltin.sqrt.v2f32(<2 x float>) From b16492f0b37ffbd28b86778ff61f7759652d192f Mon Sep 17 00:00:00 2001 From: "Sidorov, Dmitry" Date: Wed, 22 Jan 2025 04:35:29 -0800 Subject: [PATCH 04/16] format Signed-off-by: Sidorov, Dmitry --- llvm/lib/Transforms/Scalar/FPBuiltinFnSelection.cpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/llvm/lib/Transforms/Scalar/FPBuiltinFnSelection.cpp b/llvm/lib/Transforms/Scalar/FPBuiltinFnSelection.cpp index bc67518e27f80..bdaa253d9ce94 100644 --- a/llvm/lib/Transforms/Scalar/FPBuiltinFnSelection.cpp +++ b/llvm/lib/Transforms/Scalar/FPBuiltinFnSelection.cpp @@ -129,9 +129,8 @@ static bool replaceWithApproxNVPTXCalls(FPBuiltinIntrinsic &BuiltinCall) { case Intrinsic::fpbuiltin_sqrt: if (Type->isVectorTy() || !Type->getScalarType()->isFloatTy()) return replaceWithLLVMIR(BuiltinCall); - Replacement = - IRBuilder.CreateIntrinsic(BuiltinCall.getType(), - Intrinsic::nvvm_sqrt_approx_f, Args); + Replacement = IRBuilder.CreateIntrinsic( + BuiltinCall.getType(), Intrinsic::nvvm_sqrt_approx_f, Args); break; default: return false; From 01b30325b2818f5a6145c49a09a446eb5cf074d0 Mon Sep 17 00:00:00 2001 From: "Sidorov, Dmitry" Date: Thu, 23 Jan 2025 03:25:57 -0800 Subject: [PATCH 05/16] rename Signed-off-by: Sidorov, Dmitry --- llvm/lib/Transforms/Scalar/FPBuiltinFnSelection.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Transforms/Scalar/FPBuiltinFnSelection.cpp b/llvm/lib/Transforms/Scalar/FPBuiltinFnSelection.cpp index bdaa253d9ce94..0e1a1510897b7 100644 --- a/llvm/lib/Transforms/Scalar/FPBuiltinFnSelection.cpp +++ b/llvm/lib/Transforms/Scalar/FPBuiltinFnSelection.cpp @@ -109,7 +109,9 @@ static bool replaceWithLLVMIR(FPBuiltinIntrinsic &BuiltinCall) { // This function lowers llvm.fpbuiltin. intrinsic functions with 3.0 max-error // attribute to the appropriate nvvm approximate intrinsics if it's possible. -static bool replaceWithApproxNVPTXCalls(FPBuiltinIntrinsic &BuiltinCall) { +// If it's not possible - fallback to standart LLVM intrinsic or instruction. +static bool replaceWithApproxNVPTXCallsOrFallback( + FPBuiltinIntrinsic &BuiltinCall) { IRBuilder<> IRBuilder(&BuiltinCall); SmallVector Args(BuiltinCall.args()); Value *Replacement = nullptr; @@ -195,7 +197,7 @@ static bool selectFnForFPBuiltinCalls(const TargetLibraryInfo &TLI, // Lets map them on NVPTX intrinsics. If no appropriate intrinsics are known // - skip to replaceWithAltMathFunction. if (T.isNVPTX() && BuiltinCall.getRequiredAccuracy().value() == 3.0) { - if (replaceWithApproxNVPTXCalls(BuiltinCall)) + if (replaceWithApproxNVPTXCallsOrFallback(BuiltinCall)) return true; } From 5ae0e94005b76e453d97848f9fbecbbdbfb7c52d Mon Sep 17 00:00:00 2001 From: "Sidorov, Dmitry" Date: Thu, 23 Jan 2025 04:17:50 -0800 Subject: [PATCH 06/16] typo Signed-off-by: Sidorov, Dmitry --- llvm/lib/Transforms/Scalar/FPBuiltinFnSelection.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Transforms/Scalar/FPBuiltinFnSelection.cpp b/llvm/lib/Transforms/Scalar/FPBuiltinFnSelection.cpp index 0e1a1510897b7..6a5891f7c5236 100644 --- a/llvm/lib/Transforms/Scalar/FPBuiltinFnSelection.cpp +++ b/llvm/lib/Transforms/Scalar/FPBuiltinFnSelection.cpp @@ -109,7 +109,8 @@ static bool replaceWithLLVMIR(FPBuiltinIntrinsic &BuiltinCall) { // This function lowers llvm.fpbuiltin. intrinsic functions with 3.0 max-error // attribute to the appropriate nvvm approximate intrinsics if it's possible. -// If it's not possible - fallback to standart LLVM intrinsic or instruction. +// If it's not possible - fallback to standard C/C++ library LLVM intrinsic or +// instruction. static bool replaceWithApproxNVPTXCallsOrFallback( FPBuiltinIntrinsic &BuiltinCall) { IRBuilder<> IRBuilder(&BuiltinCall); From 6720ed0ad6aed5968dfda390800d50f6e807cff3 Mon Sep 17 00:00:00 2001 From: "Sidorov, Dmitry" Date: Thu, 23 Jan 2025 04:21:20 -0800 Subject: [PATCH 07/16] typo Signed-off-by: Sidorov, Dmitry --- llvm/lib/Transforms/Scalar/FPBuiltinFnSelection.cpp | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/llvm/lib/Transforms/Scalar/FPBuiltinFnSelection.cpp b/llvm/lib/Transforms/Scalar/FPBuiltinFnSelection.cpp index 6a5891f7c5236..c21473a74e758 100644 --- a/llvm/lib/Transforms/Scalar/FPBuiltinFnSelection.cpp +++ b/llvm/lib/Transforms/Scalar/FPBuiltinFnSelection.cpp @@ -109,8 +109,8 @@ static bool replaceWithLLVMIR(FPBuiltinIntrinsic &BuiltinCall) { // This function lowers llvm.fpbuiltin. intrinsic functions with 3.0 max-error // attribute to the appropriate nvvm approximate intrinsics if it's possible. -// If it's not possible - fallback to standard C/C++ library LLVM intrinsic or -// instruction. +// If it's not possible - fallback to instruction or standard C/C++ library LLVM +// intrinsic. static bool replaceWithApproxNVPTXCallsOrFallback( FPBuiltinIntrinsic &BuiltinCall) { IRBuilder<> IRBuilder(&BuiltinCall); @@ -119,9 +119,10 @@ static bool replaceWithApproxNVPTXCallsOrFallback( auto *Type = BuiltinCall.getType(); // For now only add lowering for fdiv and sqrt. Yet nvvm intrinsics have // approximate variants for sin, cos, exp2 and log2. - // For vector fpbuiltins for NVPTX target we don't have nvvm intrinsics, use - // standart for LLVM math operations. Also nvvm fdiv and sqrt intrisics - // support only float type. + // For vector fpbuiltins for NVPTX target we don't have nvvm intrinsics, + // fallback to instruction or standard C/C++ library LLVM intrinsic. Also + // nvvm fdiv and sqrt intrisics support only float type, so fallback in this + // case as well. switch (BuiltinCall.getIntrinsicID()) { case Intrinsic::fpbuiltin_fdiv: if (Type->isVectorTy() || !Type->getScalarType()->isFloatTy()) From 5b6411ab23ee9504ce7fc8bdf402af4d20913110 Mon Sep 17 00:00:00 2001 From: "Sidorov, Dmitry" Date: Thu, 23 Jan 2025 06:02:27 -0800 Subject: [PATCH 08/16] formt Signed-off-by: Sidorov, Dmitry --- llvm/lib/Transforms/Scalar/FPBuiltinFnSelection.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Transforms/Scalar/FPBuiltinFnSelection.cpp b/llvm/lib/Transforms/Scalar/FPBuiltinFnSelection.cpp index c21473a74e758..7719de44b5f0c 100644 --- a/llvm/lib/Transforms/Scalar/FPBuiltinFnSelection.cpp +++ b/llvm/lib/Transforms/Scalar/FPBuiltinFnSelection.cpp @@ -111,8 +111,8 @@ static bool replaceWithLLVMIR(FPBuiltinIntrinsic &BuiltinCall) { // attribute to the appropriate nvvm approximate intrinsics if it's possible. // If it's not possible - fallback to instruction or standard C/C++ library LLVM // intrinsic. -static bool replaceWithApproxNVPTXCallsOrFallback( - FPBuiltinIntrinsic &BuiltinCall) { +static bool +replaceWithApproxNVPTXCallsOrFallback(FPBuiltinIntrinsic &BuiltinCall) { IRBuilder<> IRBuilder(&BuiltinCall); SmallVector Args(BuiltinCall.args()); Value *Replacement = nullptr; From 32d8f6bbcf7248106b33474f84eb758b2478ac0f Mon Sep 17 00:00:00 2001 From: "Sidorov, Dmitry" Date: Tue, 28 Jan 2025 03:30:37 -0800 Subject: [PATCH 09/16] wip Signed-off-by: Sidorov, Dmitry --- llvm/lib/Transforms/Scalar/FPBuiltinFnSelection.cpp | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/llvm/lib/Transforms/Scalar/FPBuiltinFnSelection.cpp b/llvm/lib/Transforms/Scalar/FPBuiltinFnSelection.cpp index 7719de44b5f0c..17003a0c9e5f5 100644 --- a/llvm/lib/Transforms/Scalar/FPBuiltinFnSelection.cpp +++ b/llvm/lib/Transforms/Scalar/FPBuiltinFnSelection.cpp @@ -112,7 +112,8 @@ static bool replaceWithLLVMIR(FPBuiltinIntrinsic &BuiltinCall) { // If it's not possible - fallback to instruction or standard C/C++ library LLVM // intrinsic. static bool -replaceWithApproxNVPTXCallsOrFallback(FPBuiltinIntrinsic &BuiltinCall) { +replaceWithApproxNVPTXCallsOrFallback(FPBuiltinIntrinsic &BuiltinCall, + std::optional Accuracy) { IRBuilder<> IRBuilder(&BuiltinCall); SmallVector Args(BuiltinCall.args()); Value *Replacement = nullptr; @@ -198,8 +199,9 @@ static bool selectFnForFPBuiltinCalls(const TargetLibraryInfo &TLI, // We don't have implementation for CUDA approximate precision builtins. // Lets map them on NVPTX intrinsics. If no appropriate intrinsics are known // - skip to replaceWithAltMathFunction. - if (T.isNVPTX() && BuiltinCall.getRequiredAccuracy().value() == 3.0) { - if (replaceWithApproxNVPTXCallsOrFallback(BuiltinCall)) + if (T.isNVPTX()) + if (replaceWithApproxNVPTXCallsOrFallback( + BuiltinCall, BuiltinCall.getRequiredAccuracy())) return true; } From 36f86888e50bf622f8ab9bd6c8dbbf782d397d89 Mon Sep 17 00:00:00 2001 From: "Sidorov, Dmitry" Date: Tue, 28 Jan 2025 04:21:14 -0800 Subject: [PATCH 10/16] fix bug, apply comment Signed-off-by: Sidorov, Dmitry --- llvm/lib/Transforms/Scalar/FPBuiltinFnSelection.cpp | 9 ++++++--- ...or-3.0.ll => fp-builtin-intrinsics-nvvm-approx.ll} | 11 ++++++----- 2 files changed, 12 insertions(+), 8 deletions(-) rename llvm/test/CodeGen/NVPTX/{fp-builtin-intrinsics-nvvm-max-error-3.0.ll => fp-builtin-intrinsics-nvvm-approx.ll} (92%) diff --git a/llvm/lib/Transforms/Scalar/FPBuiltinFnSelection.cpp b/llvm/lib/Transforms/Scalar/FPBuiltinFnSelection.cpp index 17003a0c9e5f5..d69cc456f4086 100644 --- a/llvm/lib/Transforms/Scalar/FPBuiltinFnSelection.cpp +++ b/llvm/lib/Transforms/Scalar/FPBuiltinFnSelection.cpp @@ -107,7 +107,7 @@ static bool replaceWithLLVMIR(FPBuiltinIntrinsic &BuiltinCall) { return true; } -// This function lowers llvm.fpbuiltin. intrinsic functions with 3.0 max-error +// This function lowers llvm.fpbuiltin. intrinsic functions with max-error // attribute to the appropriate nvvm approximate intrinsics if it's possible. // If it's not possible - fallback to instruction or standard C/C++ library LLVM // intrinsic. @@ -126,12 +126,16 @@ replaceWithApproxNVPTXCallsOrFallback(FPBuiltinIntrinsic &BuiltinCall, // case as well. switch (BuiltinCall.getIntrinsicID()) { case Intrinsic::fpbuiltin_fdiv: + if (Accuracy.value() != 2.5) + return false; if (Type->isVectorTy() || !Type->getScalarType()->isFloatTy()) return replaceWithLLVMIR(BuiltinCall); Replacement = IRBuilder.CreateIntrinsic(Type, Intrinsic::nvvm_div_approx_f, Args); break; case Intrinsic::fpbuiltin_sqrt: + if (Accuracy.value() != 3.0) + return false; if (Type->isVectorTy() || !Type->getScalarType()->isFloatTy()) return replaceWithLLVMIR(BuiltinCall); Replacement = IRBuilder.CreateIntrinsic( @@ -199,11 +203,10 @@ static bool selectFnForFPBuiltinCalls(const TargetLibraryInfo &TLI, // We don't have implementation for CUDA approximate precision builtins. // Lets map them on NVPTX intrinsics. If no appropriate intrinsics are known // - skip to replaceWithAltMathFunction. - if (T.isNVPTX()) + if (T.isNVPTX() && BuiltinCall.getRequiredAccuracy().value() != 0.5) if (replaceWithApproxNVPTXCallsOrFallback( BuiltinCall, BuiltinCall.getRequiredAccuracy())) return true; - } /// Call TLI to select a function implementation to call StringRef ImplName = TLI.selectFPBuiltinImplementation(&BuiltinCall); diff --git a/llvm/test/CodeGen/NVPTX/fp-builtin-intrinsics-nvvm-max-error-3.0.ll b/llvm/test/CodeGen/NVPTX/fp-builtin-intrinsics-nvvm-approx.ll similarity index 92% rename from llvm/test/CodeGen/NVPTX/fp-builtin-intrinsics-nvvm-max-error-3.0.ll rename to llvm/test/CodeGen/NVPTX/fp-builtin-intrinsics-nvvm-approx.ll index 0827c668b8609..51b18b4e3bab6 100644 --- a/llvm/test/CodeGen/NVPTX/fp-builtin-intrinsics-nvvm-max-error-3.0.ll +++ b/llvm/test/CodeGen/NVPTX/fp-builtin-intrinsics-nvvm-approx.ll @@ -47,8 +47,8 @@ declare <2 x double> @llvm.fpbuiltin.fdiv.v2f64(<2 x double>, <2 x double>) ; CHECK: %{{.*}} = call <2 x float> @llvm.sqrt.v2f32(<2 x float> %{{.*}}) define void @test_sqrt(float %d, <2 x float> %v2d, <4 x float> %v4d) { entry: - %t0 = call float @llvm.fpbuiltin.sqrt.f32(float %d) #0 - %t1 = call <2 x float> @llvm.fpbuiltin.sqrt.v2f32(<2 x float> %v2d) #0 + %t0 = call float @llvm.fpbuiltin.sqrt.f32(float %d) #1 + %t1 = call <2 x float> @llvm.fpbuiltin.sqrt.v2f32(<2 x float> %v2d) #1 ret void } @@ -60,12 +60,13 @@ declare <2 x float> @llvm.fpbuiltin.sqrt.v2f32(<2 x float>) ; CHECK: %{{.*}} = call <2 x double> @llvm.sqrt.v2f64(<2 x double> %{{.*}}) define void @test_sqrt_double(double %d, <2 x double> %v2d) { entry: - %t0 = call double @llvm.fpbuiltin.sqrt.f64(double %d) #0 - %t1 = call <2 x double> @llvm.fpbuiltin.sqrt.v2f64(<2 x double> %v2d) #0 + %t0 = call double @llvm.fpbuiltin.sqrt.f64(double %d) #1 + %t1 = call <2 x double> @llvm.fpbuiltin.sqrt.v2f64(<2 x double> %v2d) #1 ret void } declare double @llvm.fpbuiltin.sqrt.f64(double) declare <2 x double> @llvm.fpbuiltin.sqrt.v2f64(<2 x double>) -attributes #0 = { "fpbuiltin-max-error"="3.0" } +attributes #0 = { "fpbuiltin-max-error"="2.5" } +attributes #1 = { "fpbuiltin-max-error"="3.0" } From cc7333f60bcf4f12d1046e559304f2d68666673c Mon Sep 17 00:00:00 2001 From: "Sidorov, Dmitry" Date: Tue, 28 Jan 2025 04:30:42 -0800 Subject: [PATCH 11/16] format Signed-off-by: Sidorov, Dmitry --- llvm/lib/Transforms/Scalar/FPBuiltinFnSelection.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Transforms/Scalar/FPBuiltinFnSelection.cpp b/llvm/lib/Transforms/Scalar/FPBuiltinFnSelection.cpp index d69cc456f4086..1b88e70cb231e 100644 --- a/llvm/lib/Transforms/Scalar/FPBuiltinFnSelection.cpp +++ b/llvm/lib/Transforms/Scalar/FPBuiltinFnSelection.cpp @@ -205,7 +205,7 @@ static bool selectFnForFPBuiltinCalls(const TargetLibraryInfo &TLI, // - skip to replaceWithAltMathFunction. if (T.isNVPTX() && BuiltinCall.getRequiredAccuracy().value() != 0.5) if (replaceWithApproxNVPTXCallsOrFallback( - BuiltinCall, BuiltinCall.getRequiredAccuracy())) + BuiltinCall, BuiltinCall.getRequiredAccuracy())) return true; /// Call TLI to select a function implementation to call From 9ea17f831e83d4d37a64a4323f2fcc9dc9ff7607 Mon Sep 17 00:00:00 2001 From: "Sidorov, Dmitry" Date: Wed, 29 Jan 2025 05:13:41 -0800 Subject: [PATCH 12/16] Address comment and add 0.5 max error for nvptx Signed-off-by: Sidorov, Dmitry --- .../Scalar/FPBuiltinFnSelection.cpp | 21 +- ...p-builtin-intrinsics-nvvm-max-error-0.5.ll | 219 ++++++++++++++++++ 2 files changed, 230 insertions(+), 10 deletions(-) create mode 100644 llvm/test/CodeGen/NVPTX/fp-builtin-intrinsics-nvvm-max-error-0.5.ll diff --git a/llvm/lib/Transforms/Scalar/FPBuiltinFnSelection.cpp b/llvm/lib/Transforms/Scalar/FPBuiltinFnSelection.cpp index 1b88e70cb231e..f742b6243bdfd 100644 --- a/llvm/lib/Transforms/Scalar/FPBuiltinFnSelection.cpp +++ b/llvm/lib/Transforms/Scalar/FPBuiltinFnSelection.cpp @@ -126,7 +126,7 @@ replaceWithApproxNVPTXCallsOrFallback(FPBuiltinIntrinsic &BuiltinCall, // case as well. switch (BuiltinCall.getIntrinsicID()) { case Intrinsic::fpbuiltin_fdiv: - if (Accuracy.value() != 2.5) + if (Accuracy.value() < 2.0) return false; if (Type->isVectorTy() || !Type->getScalarType()->isFloatTy()) return replaceWithLLVMIR(BuiltinCall); @@ -134,7 +134,7 @@ replaceWithApproxNVPTXCallsOrFallback(FPBuiltinIntrinsic &BuiltinCall, IRBuilder.CreateIntrinsic(Type, Intrinsic::nvvm_div_approx_f, Args); break; case Intrinsic::fpbuiltin_sqrt: - if (Accuracy.value() != 3.0) + if (Accuracy.value() < 1.0) return false; if (Type->isVectorTy() || !Type->getScalarType()->isFloatTy()) return replaceWithLLVMIR(BuiltinCall); @@ -182,10 +182,11 @@ static bool selectFnForFPBuiltinCalls(const TargetLibraryInfo &TLI, return replaceWithLLVMIR(BuiltinCall); // Several functions for "sycl" and "cuda" requires "0.5" accuracy levels, - // which means correctly rounded results. For now x86 host AltMathLibrary - // doesn't have such ability. For such accuracy level, the fpbuiltins - // should be replaced by equivalent IR operation or llvmbuiltins. - if (T.isX86() && BuiltinCall.getRequiredAccuracy().value() == 0.5) { + // which means correctly rounded results. For now x86 host and NVPTX + // AltMathLibrary doesn't have such ability. For such accuracy level, the + // fpbuiltins should be replaced by equivalent IR operation or llvmbuiltins. + if ((T.isX86() || T.isNVPTX()) && + BuiltinCall.getRequiredAccuracy().value() == 0.5) { switch (BuiltinCall.getIntrinsicID()) { case Intrinsic::fpbuiltin_fadd: case Intrinsic::fpbuiltin_fsub: @@ -200,10 +201,10 @@ static bool selectFnForFPBuiltinCalls(const TargetLibraryInfo &TLI, } } - // We don't have implementation for CUDA approximate precision builtins. - // Lets map them on NVPTX intrinsics. If no appropriate intrinsics are known - // - skip to replaceWithAltMathFunction. - if (T.isNVPTX() && BuiltinCall.getRequiredAccuracy().value() != 0.5) + // AltMathLibrary don't have implementation for CUDA approximate precision + // builtins. Lets map them on NVPTX intrinsics. If no appropriate intrinsics + // are known - skip to emit an error. + if (T.isNVPTX() && BuiltinCall.getRequiredAccuracy().value() > 0.5) if (replaceWithApproxNVPTXCallsOrFallback( BuiltinCall, BuiltinCall.getRequiredAccuracy())) return true; diff --git a/llvm/test/CodeGen/NVPTX/fp-builtin-intrinsics-nvvm-max-error-0.5.ll b/llvm/test/CodeGen/NVPTX/fp-builtin-intrinsics-nvvm-max-error-0.5.ll new file mode 100644 index 0000000000000..af1f00dee16c0 --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/fp-builtin-intrinsics-nvvm-max-error-0.5.ll @@ -0,0 +1,219 @@ +; RUN: opt -fpbuiltin-fn-selection -S < %s | FileCheck %s + +; Several functions for "sycl" and "cuda" requires "0.5" accuracy levels, +; Test if these fpbuiltins could be replaced by equivalaent IR operations +; or llvm builtins. + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64" +target triple = "nvptx64-nvidia-cuda" + +; CHECK-LABEL: @svml_fadd +; CHECK: %0 = fadd fast float %f1, %f2 +; CHECK: %1 = fadd fast <4 x float> %v4f1, %v4f2 +; CHECK: %2 = fadd fast <8 x float> %v8f1, %v8f2 +; CHECK: %3 = fadd fast <16 x float> %v16f1, %v16f2 +; CHECK: %4 = fadd fast double %d1, %d2 +; CHECK: %5 = fadd fast <2 x double> %v2d1, %v2d2 +; CHECK: %6 = fadd fast <4 x double> %v4d1, %v4d2 +; CHECK: %7 = fadd fast <8 x double> %v8d1, %v8d2 +define void @svml_fadd(float %f1, <4 x float> %v4f1, <8 x float> %v8f1, <16 x float> %v16f1, + float %f2, <4 x float> %v4f2, <8 x float> %v8f2, <16 x float> %v16f2, + double %d1, <2 x double> %v2d1, <4 x double> %v4d1, <8 x double> %v8d1, + double %d2, <2 x double> %v2d2, <4 x double> %v4d2, <8 x double> %v8d2) { +entry: + %t0_0 = call fast float @llvm.fpbuiltin.fadd.f32(float %f1, float %f2) #0 + %t1_0 = call fast <4 x float> @llvm.fpbuiltin.fadd.v4f32(<4 x float> %v4f1, <4 x float> %v4f2) #0 + %t2_0 = call fast <8 x float> @llvm.fpbuiltin.fadd.v8f32(<8 x float> %v8f1, <8 x float> %v8f2) #0 + %t3_0 = call fast <16 x float> @llvm.fpbuiltin.fadd.v16f32(<16 x float> %v16f1, <16 x float> %v16f2) #0 + %t4_0 = call fast double @llvm.fpbuiltin.fadd.f64(double %d1, double %d2) #0 + %t5_0 = call fast <2 x double> @llvm.fpbuiltin.fadd.v2f64(<2 x double> %v2d1, <2 x double> %v2d2) #0 + %t6_0 = call fast <4 x double> @llvm.fpbuiltin.fadd.v4f64(<4 x double> %v4d1, <4 x double> %v4d2) #0 + %t7_0 = call fast <8 x double> @llvm.fpbuiltin.fadd.v8f64(<8 x double> %v8d1, <8 x double> %v8d2) #0 + ret void +} + +declare float @llvm.fpbuiltin.fadd.f32(float, float) +declare <4 x float> @llvm.fpbuiltin.fadd.v4f32(<4 x float>, <4 x float>) +declare <8 x float> @llvm.fpbuiltin.fadd.v8f32(<8 x float>, <8 x float>) +declare <16 x float> @llvm.fpbuiltin.fadd.v16f32(<16 x float>, <16 x float>) +declare double @llvm.fpbuiltin.fadd.f64(double, double) +declare <2 x double> @llvm.fpbuiltin.fadd.v2f64(<2 x double>, <2 x double>) +declare <4 x double> @llvm.fpbuiltin.fadd.v4f64(<4 x double>, <4 x double>) +declare <8 x double> @llvm.fpbuiltin.fadd.v8f64(<8 x double>, <8 x double>) + +; CHECK-LABEL: @svml_fsub +; CHECK: %0 = fsub fast float %f1, %f2 +; CHECK: %1 = fsub fast <4 x float> %v4f1, %v4f2 +; CHECK: %2 = fsub fast <8 x float> %v8f1, %v8f2 +; CHECK: %3 = fsub fast <16 x float> %v16f1, %v16f2 +; CHECK: %4 = fsub fast double %d1, %d2 +; CHECK: %5 = fsub fast <2 x double> %v2d1, %v2d2 +; CHECK: %6 = fsub fast <4 x double> %v4d1, %v4d2 +; CHECK: %7 = fsub fast <8 x double> %v8d1, %v8d2 +define void @svml_fsub(float %f1, <4 x float> %v4f1, <8 x float> %v8f1, <16 x float> %v16f1, + float %f2, <4 x float> %v4f2, <8 x float> %v8f2, <16 x float> %v16f2, + double %d1, <2 x double> %v2d1, <4 x double> %v4d1, <8 x double> %v8d1, + double %d2, <2 x double> %v2d2, <4 x double> %v4d2, <8 x double> %v8d2) { +entry: + %t0_0 = call fast float @llvm.fpbuiltin.fsub.f32(float %f1, float %f2) #0 + %t1_0 = call fast <4 x float> @llvm.fpbuiltin.fsub.v4f32(<4 x float> %v4f1, <4 x float> %v4f2) #0 + %t2_0 = call fast <8 x float> @llvm.fpbuiltin.fsub.v8f32(<8 x float> %v8f1, <8 x float> %v8f2) #0 + %t3_0 = call fast <16 x float> @llvm.fpbuiltin.fsub.v16f32(<16 x float> %v16f1, <16 x float> %v16f2) #0 + %t4_0 = call fast double @llvm.fpbuiltin.fsub.f64(double %d1, double %d2) #0 + %t5_0 = call fast <2 x double> @llvm.fpbuiltin.fsub.v2f64(<2 x double> %v2d1, <2 x double> %v2d2) #0 + %t6_0 = call fast <4 x double> @llvm.fpbuiltin.fsub.v4f64(<4 x double> %v4d1, <4 x double> %v4d2) #0 + %t7_0 = call fast <8 x double> @llvm.fpbuiltin.fsub.v8f64(<8 x double> %v8d1, <8 x double> %v8d2) #0 + ret void +} + +declare float @llvm.fpbuiltin.fsub.f32(float, float) +declare <4 x float> @llvm.fpbuiltin.fsub.v4f32(<4 x float>, <4 x float>) +declare <8 x float> @llvm.fpbuiltin.fsub.v8f32(<8 x float>, <8 x float>) +declare <16 x float> @llvm.fpbuiltin.fsub.v16f32(<16 x float>, <16 x float>) +declare double @llvm.fpbuiltin.fsub.f64(double, double) +declare <2 x double> @llvm.fpbuiltin.fsub.v2f64(<2 x double>, <2 x double>) +declare <4 x double> @llvm.fpbuiltin.fsub.v4f64(<4 x double>, <4 x double>) +declare <8 x double> @llvm.fpbuiltin.fsub.v8f64(<8 x double>, <8 x double>) + +; CHECK-LABEL: @svml_fmul +; CHECK: %0 = fmul fast float %f1, %f2 +; CHECK: %1 = fmul fast <4 x float> %v4f1, %v4f2 +; CHECK: %2 = fmul fast <8 x float> %v8f1, %v8f2 +; CHECK: %3 = fmul fast <16 x float> %v16f1, %v16f2 +; CHECK: %4 = fmul fast double %d1, %d2 +; CHECK: %5 = fmul fast <2 x double> %v2d1, %v2d2 +; CHECK: %6 = fmul fast <4 x double> %v4d1, %v4d2 +; CHECK: %7 = fmul fast <8 x double> %v8d1, %v8d2 +define void @svml_fmul(float %f1, <4 x float> %v4f1, <8 x float> %v8f1, <16 x float> %v16f1, + float %f2, <4 x float> %v4f2, <8 x float> %v8f2, <16 x float> %v16f2, + double %d1, <2 x double> %v2d1, <4 x double> %v4d1, <8 x double> %v8d1, + double %d2, <2 x double> %v2d2, <4 x double> %v4d2, <8 x double> %v8d2) { +entry: + %t0_0 = call fast float @llvm.fpbuiltin.fmul.f32(float %f1, float %f2) #0 + %t1_0 = call fast <4 x float> @llvm.fpbuiltin.fmul.v4f32(<4 x float> %v4f1, <4 x float> %v4f2) #0 + %t2_0 = call fast <8 x float> @llvm.fpbuiltin.fmul.v8f32(<8 x float> %v8f1, <8 x float> %v8f2) #0 + %t3_0 = call fast <16 x float> @llvm.fpbuiltin.fmul.v16f32(<16 x float> %v16f1, <16 x float> %v16f2) #0 + %t4_0 = call fast double @llvm.fpbuiltin.fmul.f64(double %d1, double %d2) #0 + %t5_0 = call fast <2 x double> @llvm.fpbuiltin.fmul.v2f64(<2 x double> %v2d1, <2 x double> %v2d2) #0 + %t6_0 = call fast <4 x double> @llvm.fpbuiltin.fmul.v4f64(<4 x double> %v4d1, <4 x double> %v4d2) #0 + %t7_0 = call fast <8 x double> @llvm.fpbuiltin.fmul.v8f64(<8 x double> %v8d1, <8 x double> %v8d2) #0 + ret void +} + +declare float @llvm.fpbuiltin.fmul.f32(float, float) +declare <4 x float> @llvm.fpbuiltin.fmul.v4f32(<4 x float>, <4 x float>) +declare <8 x float> @llvm.fpbuiltin.fmul.v8f32(<8 x float>, <8 x float>) +declare <16 x float> @llvm.fpbuiltin.fmul.v16f32(<16 x float>, <16 x float>) +declare double @llvm.fpbuiltin.fmul.f64(double, double) +declare <2 x double> @llvm.fpbuiltin.fmul.v2f64(<2 x double>, <2 x double>) +declare <4 x double> @llvm.fpbuiltin.fmul.v4f64(<4 x double>, <4 x double>) +declare <8 x double> @llvm.fpbuiltin.fmul.v8f64(<8 x double>, <8 x double>) + +; CHECK-LABEL: @svml_fdiv +; CHECK: %0 = fdiv fast double %d1, %d2 +; CHECK: %1 = fdiv fast <2 x double> %v2d1, %v2d2 +; CHECK: %2 = fdiv fast <4 x double> %v4d1, %v4d2 +; CHECK: %3 = fdiv fast <8 x double> %v8d1, %v8d2 +define void @svml_fdiv(double %d1, <2 x double> %v2d1, <4 x double> %v4d1, <8 x double> %v8d1, + double %d2, <2 x double> %v2d2, <4 x double> %v4d2, <8 x double> %v8d2) { +entry: + %t0_0 = call fast double @llvm.fpbuiltin.fdiv.f64(double %d1, double %d2) #0 + %t1_0 = call fast <2 x double> @llvm.fpbuiltin.fdiv.v2f64(<2 x double> %v2d1, <2 x double> %v2d2) #0 + %t2_0 = call fast <4 x double> @llvm.fpbuiltin.fdiv.v4f64(<4 x double> %v4d1, <4 x double> %v4d2) #0 + %t3_0 = call fast <8 x double> @llvm.fpbuiltin.fdiv.v8f64(<8 x double> %v8d1, <8 x double> %v8d2) #0 + ret void +} + +declare double @llvm.fpbuiltin.fdiv.f64(double, double) +declare <2 x double> @llvm.fpbuiltin.fdiv.v2f64(<2 x double>, <2 x double>) +declare <4 x double> @llvm.fpbuiltin.fdiv.v4f64(<4 x double>, <4 x double>) +declare <8 x double> @llvm.fpbuiltin.fdiv.v8f64(<8 x double>, <8 x double>) + +; CHECK-LABEL: @svml_frem +; CHECK: %0 = frem fast float %f1, %f2 +; CHECK: %1 = frem fast <4 x float> %v4f1, %v4f2 +; CHECK: %2 = frem fast <8 x float> %v8f1, %v8f2 +; CHECK: %3 = frem fast <16 x float> %v16f1, %v16f2 +; CHECK: %4 = frem fast double %d1, %d2 +; CHECK: %5 = frem fast <2 x double> %v2d1, %v2d2 +; CHECK: %6 = frem fast <4 x double> %v4d1, %v4d2 +; CHECK: %7 = frem fast <8 x double> %v8d1, %v8d2 +define void @svml_frem(float %f1, <4 x float> %v4f1, <8 x float> %v8f1, <16 x float> %v16f1, + float %f2, <4 x float> %v4f2, <8 x float> %v8f2, <16 x float> %v16f2, + double %d1, <2 x double> %v2d1, <4 x double> %v4d1, <8 x double> %v8d1, + double %d2, <2 x double> %v2d2, <4 x double> %v4d2, <8 x double> %v8d2) { +entry: + %t0_0 = call fast float @llvm.fpbuiltin.frem.f32(float %f1, float %f2) #0 + %t1_0 = call fast <4 x float> @llvm.fpbuiltin.frem.v4f32(<4 x float> %v4f1, <4 x float> %v4f2) #0 + %t2_0 = call fast <8 x float> @llvm.fpbuiltin.frem.v8f32(<8 x float> %v8f1, <8 x float> %v8f2) #0 + %t3_0 = call fast <16 x float> @llvm.fpbuiltin.frem.v16f32(<16 x float> %v16f1, <16 x float> %v16f2) #0 + %t4_0 = call fast double @llvm.fpbuiltin.frem.f64(double %d1, double %d2) #0 + %t5_0 = call fast <2 x double> @llvm.fpbuiltin.frem.v2f64(<2 x double> %v2d1, <2 x double> %v2d2) #0 + %t6_0 = call fast <4 x double> @llvm.fpbuiltin.frem.v4f64(<4 x double> %v4d1, <4 x double> %v4d2) #0 + %t7_0 = call fast <8 x double> @llvm.fpbuiltin.frem.v8f64(<8 x double> %v8d1, <8 x double> %v8d2) #0 + ret void +} + +declare float @llvm.fpbuiltin.frem.f32(float, float) +declare <4 x float> @llvm.fpbuiltin.frem.v4f32(<4 x float>, <4 x float>) +declare <8 x float> @llvm.fpbuiltin.frem.v8f32(<8 x float>, <8 x float>) +declare <16 x float> @llvm.fpbuiltin.frem.v16f32(<16 x float>, <16 x float>) +declare double @llvm.fpbuiltin.frem.f64(double, double) +declare <2 x double> @llvm.fpbuiltin.frem.v2f64(<2 x double>, <2 x double>) +declare <4 x double> @llvm.fpbuiltin.frem.v4f64(<4 x double>, <4 x double>) +declare <8 x double> @llvm.fpbuiltin.frem.v8f64(<8 x double>, <8 x double>) + +; CHECK-LABEL: @svml_sqrt +; CHECK: %0 = call double @llvm.sqrt.f64(double %d) +; CHECK: %1 = call <2 x double> @llvm.sqrt.v2f64(<2 x double> %v2d) +; CHECK: %2 = call <4 x double> @llvm.sqrt.v4f64(<4 x double> %v4d) +; CHECK: %3 = call <8 x double> @llvm.sqrt.v8f64(<8 x double> %v8d) +define void @svml_sqrt(double %d, <2 x double> %v2d, <4 x double> %v4d, <8 x double> %v8d) { +entry: + %t4_0 = call double @llvm.fpbuiltin.sqrt.f64(double %d) #0 + %t5_0 = call <2 x double> @llvm.fpbuiltin.sqrt.v2f64(<2 x double> %v2d) #0 + %t6_0 = call <4 x double> @llvm.fpbuiltin.sqrt.v4f64(<4 x double> %v4d) #0 + %t7_0 = call <8 x double> @llvm.fpbuiltin.sqrt.v8f64(<8 x double> %v8d) #0 + ret void +} + +declare double @llvm.fpbuiltin.sqrt.f64(double) +declare <2 x double> @llvm.fpbuiltin.sqrt.v2f64(<2 x double>) +declare <4 x double> @llvm.fpbuiltin.sqrt.v4f64(<4 x double>) +declare <8 x double> @llvm.fpbuiltin.sqrt.v8f64(<8 x double>) + +; CHECK-LABEL: @svml_ldexp +; CHECK: %0 = call fast float @llvm.ldexp.f32.i32(float %f1, i32 %f2) +; CHECK: %1 = call fast <4 x float> @llvm.ldexp.v4f32.v4i32(<4 x float> %v4f1, <4 x i32> %v4f2) +; CHECK: %2 = call fast <8 x float> @llvm.ldexp.v8f32.v8i32(<8 x float> %v8f1, <8 x i32> %v8f2) +; CHECK: %3 = call fast <16 x float> @llvm.ldexp.v16f32.v16i32(<16 x float> %v16f1, <16 x i32> %v16f2) +; CHECK: %4 = call fast double @llvm.ldexp.f64.i32(double %d1, i32 %d2) +; CHECK: %5 = call fast <2 x double> @llvm.ldexp.v2f64.v2i32(<2 x double> %v2d1, <2 x i32> %v2d2) +; CHECK: %6 = call fast <4 x double> @llvm.ldexp.v4f64.v4i32(<4 x double> %v4d1, <4 x i32> %v4d2) +; CHECK: %7 = call fast <8 x double> @llvm.ldexp.v8f64.v8i32(<8 x double> %v8d1, <8 x i32> %v8d2) +define void @svml_ldexp(float %f1, <4 x float> %v4f1, <8 x float> %v8f1, <16 x float> %v16f1, + i32 %f2, <4 x i32> %v4f2, <8 x i32> %v8f2, <16 x i32> %v16f2, + double %d1, <2 x double> %v2d1, <4 x double> %v4d1, <8 x double> %v8d1, + i32 %d2, <2 x i32> %v2d2, <4 x i32> %v4d2, <8 x i32> %v8d2) { +entry: + %t0_0 = call fast float @llvm.fpbuiltin.ldexp.f32.i32(float %f1, i32 %f2) #0 + %t1_0 = call fast <4 x float> @llvm.fpbuiltin.ldexp.v4f32.v4i32(<4 x float> %v4f1, <4 x i32> %v4f2) #0 + %t2_0 = call fast <8 x float> @llvm.fpbuiltin.ldexp.v8f32.v8i32(<8 x float> %v8f1, <8 x i32> %v8f2) #0 + %t3_0 = call fast <16 x float> @llvm.fpbuiltin.ldexp.v16f32.v16i32(<16 x float> %v16f1, <16 x i32> %v16f2) #0 + %t4_0 = call fast double @llvm.fpbuiltin.ldexp.f64.i32(double %d1, i32 %d2) #0 + %t5_0 = call fast <2 x double> @llvm.fpbuiltin.ldexp.v2f64.v2i32(<2 x double> %v2d1, <2 x i32> %v2d2) #0 + %t6_0 = call fast <4 x double> @llvm.fpbuiltin.ldexp.v4f64.v4i32(<4 x double> %v4d1, <4 x i32> %v4d2) #0 + %t7_0 = call fast <8 x double> @llvm.fpbuiltin.ldexp.v8f64.v8i32(<8 x double> %v8d1, <8 x i32> %v8d2) #0 + ret void +} + +declare float @llvm.fpbuiltin.ldexp.f32.i32(float, i32) +declare <4 x float> @llvm.fpbuiltin.ldexp.v4f32.v4i32(<4 x float>, <4 x i32>) +declare <8 x float> @llvm.fpbuiltin.ldexp.v8f32.v8i32(<8 x float>, <8 x i32>) +declare <16 x float> @llvm.fpbuiltin.ldexp.v16f32.v16i32(<16 x float>, <16 x i32>) +declare double @llvm.fpbuiltin.ldexp.f64.i32(double, i32) +declare <2 x double> @llvm.fpbuiltin.ldexp.v2f64.v2i32(<2 x double>, <2 x i32>) +declare <4 x double> @llvm.fpbuiltin.ldexp.v4f64.v4i32(<4 x double>, <4 x i32>) +declare <8 x double> @llvm.fpbuiltin.ldexp.v8f64.v8i32(<8 x double>, <8 x i32>) + +attributes #0 = { "fpbuiltin-max-error"="0.5" } From 700ae48f49bd9e0d23f52b753c2503eedbb10959 Mon Sep 17 00:00:00 2001 From: "Sidorov, Dmitry" Date: Wed, 29 Jan 2025 06:07:57 -0800 Subject: [PATCH 13/16] format Signed-off-by: Sidorov, Dmitry --- llvm/lib/Transforms/Scalar/FPBuiltinFnSelection.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Transforms/Scalar/FPBuiltinFnSelection.cpp b/llvm/lib/Transforms/Scalar/FPBuiltinFnSelection.cpp index f742b6243bdfd..b64241d2fd809 100644 --- a/llvm/lib/Transforms/Scalar/FPBuiltinFnSelection.cpp +++ b/llvm/lib/Transforms/Scalar/FPBuiltinFnSelection.cpp @@ -138,8 +138,8 @@ replaceWithApproxNVPTXCallsOrFallback(FPBuiltinIntrinsic &BuiltinCall, return false; if (Type->isVectorTy() || !Type->getScalarType()->isFloatTy()) return replaceWithLLVMIR(BuiltinCall); - Replacement = IRBuilder.CreateIntrinsic( - BuiltinCall.getType(), Intrinsic::nvvm_sqrt_approx_f, Args); + Replacement = + IRBuilder.CreateIntrinsic(Type, Intrinsic::nvvm_sqrt_approx_f, Args); break; default: return false; From 2df0a72ec97416d06c6ed0a23f3f83f0531e7219 Mon Sep 17 00:00:00 2001 From: "Sidorov, Dmitry" Date: Wed, 29 Jan 2025 10:32:01 -0800 Subject: [PATCH 14/16] apply suggestions Signed-off-by: Sidorov, Dmitry --- .../CodeGen/NVPTX/fp-builtin-intrinsics-nvvm-max-error-0.5.ll | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/test/CodeGen/NVPTX/fp-builtin-intrinsics-nvvm-max-error-0.5.ll b/llvm/test/CodeGen/NVPTX/fp-builtin-intrinsics-nvvm-max-error-0.5.ll index af1f00dee16c0..3777ad8c52b6d 100644 --- a/llvm/test/CodeGen/NVPTX/fp-builtin-intrinsics-nvvm-max-error-0.5.ll +++ b/llvm/test/CodeGen/NVPTX/fp-builtin-intrinsics-nvvm-max-error-0.5.ll @@ -1,8 +1,8 @@ ; RUN: opt -fpbuiltin-fn-selection -S < %s | FileCheck %s -; Several functions for "sycl" and "cuda" requires "0.5" accuracy levels, +; Several functions for SYCL and CUDA requires "0.5" accuracy levels, ; Test if these fpbuiltins could be replaced by equivalaent IR operations -; or llvm builtins. +; or LLVM builtins. target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64" target triple = "nvptx64-nvidia-cuda" From e86be33d61863f66f3238a2362398650c0273eac Mon Sep 17 00:00:00 2001 From: "Sidorov, Dmitry" Date: Wed, 29 Jan 2025 13:43:39 -0800 Subject: [PATCH 15/16] add test Signed-off-by: Sidorov, Dmitry --- .../fp-builtin-intrinsics-nvvm-approx.ll | 22 +++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/llvm/test/CodeGen/NVPTX/fp-builtin-intrinsics-nvvm-approx.ll b/llvm/test/CodeGen/NVPTX/fp-builtin-intrinsics-nvvm-approx.ll index 51b18b4e3bab6..d341053058a22 100644 --- a/llvm/test/CodeGen/NVPTX/fp-builtin-intrinsics-nvvm-approx.ll +++ b/llvm/test/CodeGen/NVPTX/fp-builtin-intrinsics-nvvm-approx.ll @@ -25,6 +25,17 @@ entry: ret void } +; CHECK-LABEL: @test_fdiv_max_error +; CHECK: %{{.*}} = call float @llvm.nvvm.div.approx.f(float %{{.*}}, float %{{.*}}) +; CHECK: %{{.*}} = fdiv <2 x float> %{{.*}}, %{{.*}} +define void @test_fdiv_max_error(float %d1, <2 x float> %v2d1, + float %d2, <2 x float> %v2d2) { +entry: + %t0 = call float @llvm.fpbuiltin.fdiv.f32(float %d1, float %d2) #2 + %t1 = call <2 x float> @llvm.fpbuiltin.fdiv.v2f32(<2 x float> %v2d1, <2 x float> %v2d2) #2 + ret void +} + declare float @llvm.fpbuiltin.fdiv.f32(float, float) declare <2 x float> @llvm.fpbuiltin.fdiv.v2f32(<2 x float>, <2 x float>) @@ -52,6 +63,16 @@ entry: ret void } +; CHECK-LABEL: @test_sqrt_max_error +; CHECK: %{{.*}} = call float @llvm.nvvm.sqrt.approx.f(float %{{.*}}) +; CHECK: %{{.*}} = call <2 x float> @llvm.sqrt.v2f32(<2 x float> %{{.*}}) +define void @test_sqrt_max_error(float %d, <2 x float> %v2d, <4 x float> %v4d) { +entry: + %t0 = call float @llvm.fpbuiltin.sqrt.f32(float %d) #2 + %t1 = call <2 x float> @llvm.fpbuiltin.sqrt.v2f32(<2 x float> %v2d) #2 + ret void +} + declare float @llvm.fpbuiltin.sqrt.f32(float) declare <2 x float> @llvm.fpbuiltin.sqrt.v2f32(<2 x float>) @@ -70,3 +91,4 @@ declare <2 x double> @llvm.fpbuiltin.sqrt.v2f64(<2 x double>) attributes #0 = { "fpbuiltin-max-error"="2.5" } attributes #1 = { "fpbuiltin-max-error"="3.0" } +attributes #1 = { "fpbuiltin-max-error"="10.0" } From 6113cc3f4ace250d8da3cab6eaf83fabc87fec7c Mon Sep 17 00:00:00 2001 From: "Sidorov, Dmitry" Date: Wed, 29 Jan 2025 16:24:32 -0800 Subject: [PATCH 16/16] fix typo Signed-off-by: Sidorov, Dmitry --- llvm/test/CodeGen/NVPTX/fp-builtin-intrinsics-nvvm-approx.ll | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/test/CodeGen/NVPTX/fp-builtin-intrinsics-nvvm-approx.ll b/llvm/test/CodeGen/NVPTX/fp-builtin-intrinsics-nvvm-approx.ll index d341053058a22..6c7ce8af804d9 100644 --- a/llvm/test/CodeGen/NVPTX/fp-builtin-intrinsics-nvvm-approx.ll +++ b/llvm/test/CodeGen/NVPTX/fp-builtin-intrinsics-nvvm-approx.ll @@ -91,4 +91,4 @@ declare <2 x double> @llvm.fpbuiltin.sqrt.v2f64(<2 x double>) attributes #0 = { "fpbuiltin-max-error"="2.5" } attributes #1 = { "fpbuiltin-max-error"="3.0" } -attributes #1 = { "fpbuiltin-max-error"="10.0" } +attributes #2 = { "fpbuiltin-max-error"="10.0" }