From ca71b1f9dd77a63ca9dafc3778c54300cc35d56e Mon Sep 17 00:00:00 2001
From: "Sidorov, Dmitry" <dmitry.sidorov@intel.com>
Date: Mon, 20 Jan 2025 10:55:24 -0800
Subject: [PATCH 01/16] try

Signed-off-by: Sidorov, Dmitry <dmitry.sidorov@intel.com>
---
 .../Scalar/FPBuiltinFnSelection.cpp           | 30 +++++++++++++++++++
 1 file changed, 30 insertions(+)
diff --git a/llvm/lib/Transforms/Scalar/FPBuiltinFnSelection.cpp b/llvm/lib/Transforms/Scalar/FPBuiltinFnSelection.cpp
index dd423b42ab0ec..658d2965aff1b 100644
--- a/llvm/lib/Transforms/Scalar/FPBuiltinFnSelection.cpp
+++ b/llvm/lib/Transforms/Scalar/FPBuiltinFnSelection.cpp
@@ -64,6 +64,30 @@ static bool replaceWithAltMathFunction(FPBuiltinIntrinsic &BuiltinCall,
   return true;
 }
 
+static bool replaceWithNVPTXCalls(FPBuiltinIntrinsic &BuiltinCall) {
+  IRBuilder<> IRBuilder(&BuiltinCall);
+  SmallVector<Value *> Args(BuiltinCall.args());
+  Value *Replacement = nullptr;
+  switch (BuiltinCall.getIntrinsicID()) {
+  case Intrinsic::fpbuiltin_fdiv:
+    Replacement = IRBuilder.CreateFDiv(Args[0], Args[1]);
+    break;
+  case Intrinsic::fpbuiltin_sqrt:
+    Replacement =
+        IRBuilder.CreateIntrinsic(BuiltinCall.getType(), Intrinsic::sqrt, Args);
+    break;
+  default:
+    return false;
+  }
+  BuiltinCall.replaceAllUsesWith(Replacement);
+  cast<Instruction>(Replacement)->copyFastMathFlags(&BuiltinCall);
+  LLVM_DEBUG(dbgs() << DEBUG_TYPE << ": Replaced call to `"
+                    << BuiltinCall.getCalledFunction()->getName()
+                    << "` with equivalent IR. \n `");
+  return true;
+
+}
+
 static bool replaceWithLLVMIR(FPBuiltinIntrinsic &BuiltinCall) {
   // Replace the call to the fpbuiltin intrinsic with a call
   // to the corresponding function from the alternate math library.
@@ -154,6 +178,12 @@ static bool selectFnForFPBuiltinCalls(const TargetLibraryInfo &TLI,
     }
   }
 
+  if (T.isNVPTX() && BuiltinCall.getRequiredAccuracy().value() == 3.0) {
+    bool ToReturn = replaceWithNVPTXCalls(BuiltinCall);
+    if (ToReturn)
+      return true;
+  }
+
   /// Call TLI to select a function implementation to call
   StringRef ImplName = TLI.selectFPBuiltinImplementation(&BuiltinCall);
   if (ImplName.empty()) {

From 58269113517c55e2cd09438c4013a86a761ea57d Mon Sep 17 00:00:00 2001
From: "Sidorov, Dmitry" <dmitry.sidorov@intel.com>
Date: Tue, 21 Jan 2025 06:15:57 -0800
Subject: [PATCH 02/16] [SYCL][NVPTX] Set default fdiv and sqrt for
 llvm.fpbuiltin for 3.0 max-error

We lack an implementation of the llvm.fpbuiltin intrinsics for the NVPTX
target. This patch adds a type- and fast-math-dependent mapping of
llvm.fpbuiltin.fdiv and llvm.fpbuiltin.sqrt with 3.0 max-error onto nvvm
intrinsics:
fp32 scalar @llvm.fpbuiltin.fdiv -> @llvm.nvvm.div.approx.f
fp32 scalar @llvm.fpbuiltin.fdiv fast -> @llvm.nvvm.div.approx.ftz.f
fp32 scalar @llvm.fpbuiltin.sqrt -> @llvm.nvvm.sqrt.approx.f
fp32 scalar @llvm.fpbuiltin.sqrt fast -> @llvm.nvvm.sqrt.approx.ftz.f

vector or non-fp32 scalar llvm.fpbuiltin.fdiv -> fdiv
vector or non-fp32 scalar llvm.fpbuiltin.sqrt -> llvm.sqrt

Signed-off-by: Sidorov, Dmitry <dmitry.sidorov@intel.com>
---
 .../Scalar/FPBuiltinFnSelection.cpp           | 73 +++++++++++------
 ...p-builtin-intrinsics-nvvm-max-error-3.0.ll | 81 +++++++++++++++++++
 2 files changed, 128 insertions(+), 26 deletions(-)
 create mode 100644 llvm/test/CodeGen/NVPTX/fp-builtin-intrinsics-nvvm-max-error-3.0.ll

diff --git a/llvm/lib/Transforms/Scalar/FPBuiltinFnSelection.cpp b/llvm/lib/Transforms/Scalar/FPBuiltinFnSelection.cpp
index 658d2965aff1b..56755f1c775dd 100644
--- a/llvm/lib/Transforms/Scalar/FPBuiltinFnSelection.cpp
+++ b/llvm/lib/Transforms/Scalar/FPBuiltinFnSelection.cpp
@@ -18,6 +18,7 @@
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/InstIterator.h"
 #include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/IntrinsicsNVPTX.h"
 #include "llvm/InitializePasses.h"
 #include "llvm/Support/FormatVariadic.h"
 
@@ -64,30 +65,6 @@ static bool replaceWithAltMathFunction(FPBuiltinIntrinsic &BuiltinCall,
   return true;
 }
 
-static bool replaceWithNVPTXCalls(FPBuiltinIntrinsic &BuiltinCall) {
-  IRBuilder<> IRBuilder(&BuiltinCall);
-  SmallVector<Value *> Args(BuiltinCall.args());
-  Value *Replacement = nullptr;
-  switch (BuiltinCall.getIntrinsicID()) {
-  case Intrinsic::fpbuiltin_fdiv:
-    Replacement = IRBuilder.CreateFDiv(Args[0], Args[1]);
-    break;
-  case Intrinsic::fpbuiltin_sqrt:
-    Replacement =
-        IRBuilder.CreateIntrinsic(BuiltinCall.getType(), Intrinsic::sqrt, Args);
-    break;
-  default:
-    return false;
-  }
-  BuiltinCall.replaceAllUsesWith(Replacement);
-  cast<Instruction>(Replacement)->copyFastMathFlags(&BuiltinCall);
-  LLVM_DEBUG(dbgs() << DEBUG_TYPE << ": Replaced call to `"
-                    << BuiltinCall.getCalledFunction()->getName()
-                    << "` with equivalent IR. \n `");
-  return true;
-
-}
-
 static bool replaceWithLLVMIR(FPBuiltinIntrinsic &BuiltinCall) {
   // Replace the call to the fpbuiltin intrinsic with a call
   // to the corresponding function from the alternate math library.
@@ -130,6 +107,48 @@ static bool replaceWithLLVMIR(FPBuiltinIntrinsic &BuiltinCall) {
   return true;
 }
 
+static bool replaceWithNVPTXCalls(FPBuiltinIntrinsic &BuiltinCall) {
+  IRBuilder<> IRBuilder(&BuiltinCall);
+  SmallVector<Value *> Args(BuiltinCall.args());
+  Value *Replacement = nullptr;
+  // To chose between ftz and non-ftz intrinsic.
+  FastMathFlags FMF = BuiltinCall.getFastMathFlags();
+  auto *Type = BuiltinCall.getType();
+  // For now only add lowering for fdiv and sqrt. Yet nvvm intrinsics have
+  // approximate variants for sin, cos, exp2 and log2.
+  // For vector fpbuiltins for NVPTX target we don't have nvvm intrinsics, use
+  // standart for LLVM math operations. Also nvvm fdiv and sqrt intrisics
+  // support only float type.
+  switch (BuiltinCall.getIntrinsicID()) {
+  case Intrinsic::fpbuiltin_fdiv:
+    if (Type->isVectorTy() || !Type->getScalarType()->isFloatTy())
+      return replaceWithLLVMIR(BuiltinCall);
+    Replacement =
+        IRBuilder.CreateIntrinsic(Type,
+                                  FMF.isFast()
+                                  ? Intrinsic::nvvm_div_approx_ftz_f
+                                  : Intrinsic::nvvm_div_approx_f, Args);
+    break;
+  case Intrinsic::fpbuiltin_sqrt:
+    if (Type->isVectorTy() || !Type->getScalarType()->isFloatTy())
+      return replaceWithLLVMIR(BuiltinCall);
+    Replacement =
+        IRBuilder.CreateIntrinsic(BuiltinCall.getType(),
+                                  FMF.isFast()
+                                  ? Intrinsic::nvvm_sqrt_approx_ftz_f
+                                  : Intrinsic::nvvm_sqrt_approx_f, Args);
+    break;
+  default:
+    return false;
+  }
+  BuiltinCall.replaceAllUsesWith(Replacement);
+  cast<Instruction>(Replacement)->copyFastMathFlags(&BuiltinCall);
+  LLVM_DEBUG(dbgs() << DEBUG_TYPE << ": Replaced call to `"
+                    << BuiltinCall.getCalledFunction()->getName()
+                    << "` with equivalent IR. \n `");
+  return true;
+}
+
 static bool selectFnForFPBuiltinCalls(const TargetLibraryInfo &TLI,
                                       const TargetTransformInfo &TTI,
                                       FPBuiltinIntrinsic &BuiltinCall) {
@@ -178,9 +197,11 @@ static bool selectFnForFPBuiltinCalls(const TargetLibraryInfo &TLI,
     }
   }
 
+  // We don't have implementation for CUDA approximate precision builtins.
+  // Lets map them on NVPTX intrinsics. If no appropriate intrinsics are known
+  // - skip to replaceWithAltMathFunction.
   if (T.isNVPTX() && BuiltinCall.getRequiredAccuracy().value() == 3.0) {
-    bool ToReturn = replaceWithNVPTXCalls(BuiltinCall);
-    if (ToReturn)
+    if (replaceWithNVPTXCalls(BuiltinCall))
       return true;
   }
 
diff --git a/llvm/test/CodeGen/NVPTX/fp-builtin-intrinsics-nvvm-max-error-3.0.ll b/llvm/test/CodeGen/NVPTX/fp-builtin-intrinsics-nvvm-max-error-3.0.ll
new file mode 100644
index 0000000000000..346e3475b5d75
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/fp-builtin-intrinsics-nvvm-max-error-3.0.ll
@@ -0,0 +1,81 @@
+; RUN: opt -fpbuiltin-fn-selection -S < %s | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
+target triple = "nvptx64-nvidia-cuda"
+
+; CHECK-LABEL: @test_fdiv
+; CHECK: %{{.*}} = call float @llvm.nvvm.div.approx.f(float %{{.*}}, float %{{.*}})
+; CHECK: %{{.*}} = fdiv <2 x float> %{{.*}}, %{{.*}}
+define void @test_fdiv(float %d1, <2 x float> %v2d1,
+                       float %d2, <2 x float> %v2d2) {
+entry:
+  %t0 = call float @llvm.fpbuiltin.fdiv.f32(float %d1, float %d2) #0
+  %t1 = call <2 x float> @llvm.fpbuiltin.fdiv.v2f32(<2 x float> %v2d1, <2 x float> %v2d2) #0
+  ret void
+}
+
+; CHECK-LABEL: @test_fdiv_fast
+; CHECK: %{{.*}} = call fast float @llvm.nvvm.div.approx.ftz.f(float %{{.*}}, float %{{.*}})
+; CHECK: %{{.*}} = fdiv fast <2 x float> %{{.*}}, %{{.*}}
+define void @test_fdiv_fast(float %d1, <2 x float> %v2d1,
+                            float %d2, <2 x float> %v2d2) {
+entry:
+  %t0 = call fast float @llvm.fpbuiltin.fdiv.f32(float %d1, float %d2) #0
+  %t1 = call fast <2 x float> @llvm.fpbuiltin.fdiv.v2f32(<2 x float> %v2d1, <2 x float> %v2d2) #0
+  ret void
+}
+
+declare float @llvm.fpbuiltin.fdiv.f32(float, float)
+declare <2 x float> @llvm.fpbuiltin.fdiv.v2f32(<2 x float>, <2 x float>)
+
+; CHECK-LABEL: @test_fdiv_double
+; CHECK: %{{.*}} = fdiv double %{{.*}}, %{{.*}}
+; CHECK: %{{.*}} = fdiv <2 x double> %{{.*}}, %{{.*}}
+define void @test_fdiv_double(double %d1, <2 x double> %v2d1,
+                              double %d2, <2 x double> %v2d2) {
+entry:
+  %t0 = call double @llvm.fpbuiltin.fdiv.f64(double %d1, double %d2) #0
+  %t1 = call <2 x double> @llvm.fpbuiltin.fdiv.v2f64(<2 x double> %v2d1, <2 x double> %v2d2) #0
+  ret void
+}
+
+declare double @llvm.fpbuiltin.fdiv.f64(double, double)
+declare <2 x double> @llvm.fpbuiltin.fdiv.v2f64(<2 x double>, <2 x double>)
+
+; CHECK-LABEL: @test_sqrt
+; CHECK: %{{.*}} = call float @llvm.nvvm.sqrt.approx.f(float %{{.*}})
+; CHECK: %{{.*}} = call <2 x float> @llvm.sqrt.v2f32(<2 x float> %{{.*}})
+define void @test_sqrt(float %d, <2 x float> %v2d, <4 x float> %v4d) {
+entry:
+  %t0 = call float @llvm.fpbuiltin.sqrt.f32(float %d) #0
+  %t1 = call <2 x float> @llvm.fpbuiltin.sqrt.v2f32(<2 x float> %v2d) #0
+  ret void
+}
+
+; CHECK-LABEL: @test_sqrt_fast
+; CHECK: %{{.*}} = call fast float @llvm.nvvm.sqrt.approx.ftz.f(float %{{.*}})
+; CHECK: %{{.*}} = call fast <2 x float> @llvm.sqrt.v2f32(<2 x float> %{{.*}})
+define void @test_sqrt_fast(float %d, <2 x float> %v2d, <4 x float> %v4d) {
+entry:
+  %t0 = call fast float @llvm.fpbuiltin.sqrt.f32(float %d) #0
+  %t1 = call fast <2 x float> @llvm.fpbuiltin.sqrt.v2f32(<2 x float> %v2d) #0
+  ret void
+}
+
+declare float @llvm.fpbuiltin.sqrt.f32(float)
+declare <2 x float> @llvm.fpbuiltin.sqrt.v2f32(<2 x float>)
+
+; CHECK-LABEL: @test_sqrt_double
+; CHECK: %{{.*}} = call double @llvm.sqrt.f64(double %{{.*}})
+; CHECK: %{{.*}} = call <2 x double> @llvm.sqrt.v2f64(<2 x double> %{{.*}})
+define void @test_sqrt_double(double %d, <2 x double> %v2d) {
+entry:
+  %t0 = call double @llvm.fpbuiltin.sqrt.f64(double %d) #0
+  %t1 = call <2 x double> @llvm.fpbuiltin.sqrt.v2f64(<2 x double> %v2d) #0
+  ret void
+}
+
+declare double @llvm.fpbuiltin.sqrt.f64(double)
+declare <2 x double> @llvm.fpbuiltin.sqrt.v2f64(<2 x double>)
+
+attributes #0 = { "fpbuiltin-max-error"="3.0" }

From ee6428759f1883ca8409c2ab9d5dd1bf363c5eb0 Mon Sep 17 00:00:00 2001
From: "Sidorov, Dmitry" <dmitry.sidorov@intel.com>
Date: Wed, 22 Jan 2025 04:25:38 -0800
Subject: [PATCH 03/16] apply comments

Signed-off-by: Sidorov, Dmitry <dmitry.sidorov@intel.com>
---
 .../Transforms/Scalar/FPBuiltinFnSelection.cpp  | 17 ++++++-----------
 .../fp-builtin-intrinsics-nvvm-max-error-3.0.ll | 12 +-----------
 2 files changed, 7 insertions(+), 22 deletions(-)

diff --git a/llvm/lib/Transforms/Scalar/FPBuiltinFnSelection.cpp b/llvm/lib/Transforms/Scalar/FPBuiltinFnSelection.cpp
index 56755f1c775dd..bc67518e27f80 100644
--- a/llvm/lib/Transforms/Scalar/FPBuiltinFnSelection.cpp
+++ b/llvm/lib/Transforms/Scalar/FPBuiltinFnSelection.cpp
@@ -107,12 +107,12 @@ static bool replaceWithLLVMIR(FPBuiltinIntrinsic &BuiltinCall) {
   return true;
 }
 
-static bool replaceWithNVPTXCalls(FPBuiltinIntrinsic &BuiltinCall) {
+// This function lowers llvm.fpbuiltin. intrinsic functions with 3.0 max-error
+// attribute to the appropriate nvvm approximate intrinsics if it's possible.
+static bool replaceWithApproxNVPTXCalls(FPBuiltinIntrinsic &BuiltinCall) {
   IRBuilder<> IRBuilder(&BuiltinCall);
   SmallVector<Value *> Args(BuiltinCall.args());
   Value *Replacement = nullptr;
-  // To chose between ftz and non-ftz intrinsic.
-  FastMathFlags FMF = BuiltinCall.getFastMathFlags();
   auto *Type = BuiltinCall.getType();
   // For now only add lowering for fdiv and sqrt. Yet nvvm intrinsics have
   // approximate variants for sin, cos, exp2 and log2.
@@ -124,19 +124,14 @@ static bool replaceWithNVPTXCalls(FPBuiltinIntrinsic &BuiltinCall) {
     if (Type->isVectorTy() || !Type->getScalarType()->isFloatTy())
       return replaceWithLLVMIR(BuiltinCall);
     Replacement =
-        IRBuilder.CreateIntrinsic(Type,
-                                  FMF.isFast()
-                                  ? Intrinsic::nvvm_div_approx_ftz_f
-                                  : Intrinsic::nvvm_div_approx_f, Args);
+        IRBuilder.CreateIntrinsic(Type, Intrinsic::nvvm_div_approx_f, Args);
     break;
   case Intrinsic::fpbuiltin_sqrt:
     if (Type->isVectorTy() || !Type->getScalarType()->isFloatTy())
       return replaceWithLLVMIR(BuiltinCall);
     Replacement =
         IRBuilder.CreateIntrinsic(BuiltinCall.getType(),
-                                  FMF.isFast()
-                                  ? Intrinsic::nvvm_sqrt_approx_ftz_f
-                                  : Intrinsic::nvvm_sqrt_approx_f, Args);
+                                  Intrinsic::nvvm_sqrt_approx_f, Args);
     break;
   default:
     return false;
@@ -201,7 +196,7 @@ static bool selectFnForFPBuiltinCalls(const TargetLibraryInfo &TLI,
   // Lets map them on NVPTX intrinsics. If no appropriate intrinsics are known
   // - skip to replaceWithAltMathFunction.
   if (T.isNVPTX() && BuiltinCall.getRequiredAccuracy().value() == 3.0) {
-    if (replaceWithNVPTXCalls(BuiltinCall))
+    if (replaceWithApproxNVPTXCalls(BuiltinCall))
       return true;
   }
 
diff --git a/llvm/test/CodeGen/NVPTX/fp-builtin-intrinsics-nvvm-max-error-3.0.ll b/llvm/test/CodeGen/NVPTX/fp-builtin-intrinsics-nvvm-max-error-3.0.ll
index 346e3475b5d75..0827c668b8609 100644
--- a/llvm/test/CodeGen/NVPTX/fp-builtin-intrinsics-nvvm-max-error-3.0.ll
+++ b/llvm/test/CodeGen/NVPTX/fp-builtin-intrinsics-nvvm-max-error-3.0.ll
@@ -15,7 +15,7 @@ entry:
 }
 
 ; CHECK-LABEL: @test_fdiv_fast
-; CHECK: %{{.*}} = call fast float @llvm.nvvm.div.approx.ftz.f(float %{{.*}}, float %{{.*}})
+; CHECK: %{{.*}} = call fast float @llvm.nvvm.div.approx.f(float %{{.*}}, float %{{.*}})
 ; CHECK: %{{.*}} = fdiv fast <2 x float> %{{.*}}, %{{.*}}
 define void @test_fdiv_fast(float %d1, <2 x float> %v2d1,
                             float %d2, <2 x float> %v2d2) {
@@ -52,16 +52,6 @@ entry:
   ret void
 }
 
-; CHECK-LABEL: @test_sqrt_fast
-; CHECK: %{{.*}} = call fast float @llvm.nvvm.sqrt.approx.ftz.f(float %{{.*}})
-; CHECK: %{{.*}} = call fast <2 x float> @llvm.sqrt.v2f32(<2 x float> %{{.*}})
-define void @test_sqrt_fast(float %d, <2 x float> %v2d, <4 x float> %v4d) {
-entry:
-  %t0 = call fast float @llvm.fpbuiltin.sqrt.f32(float %d) #0
-  %t1 = call fast <2 x float> @llvm.fpbuiltin.sqrt.v2f32(<2 x float> %v2d) #0
-  ret void
-}
-
 declare float @llvm.fpbuiltin.sqrt.f32(float)
 declare <2 x float> @llvm.fpbuiltin.sqrt.v2f32(<2 x float>)
 

From b16492f0b37ffbd28b86778ff61f7759652d192f Mon Sep 17 00:00:00 2001
From: "Sidorov, Dmitry" <dmitry.sidorov@intel.com>
Date: Wed, 22 Jan 2025 04:35:29 -0800
Subject: [PATCH 04/16] format

Signed-off-by: Sidorov, Dmitry <dmitry.sidorov@intel.com>
---
 llvm/lib/Transforms/Scalar/FPBuiltinFnSelection.cpp | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/Transforms/Scalar/FPBuiltinFnSelection.cpp b/llvm/lib/Transforms/Scalar/FPBuiltinFnSelection.cpp
index bc67518e27f80..bdaa253d9ce94 100644
--- a/llvm/lib/Transforms/Scalar/FPBuiltinFnSelection.cpp
+++ b/llvm/lib/Transforms/Scalar/FPBuiltinFnSelection.cpp
@@ -129,9 +129,8 @@ static bool replaceWithApproxNVPTXCalls(FPBuiltinIntrinsic &BuiltinCall) {
   case Intrinsic::fpbuiltin_sqrt:
     if (Type->isVectorTy() || !Type->getScalarType()->isFloatTy())
       return replaceWithLLVMIR(BuiltinCall);
-    Replacement =
-        IRBuilder.CreateIntrinsic(BuiltinCall.getType(),
-                                  Intrinsic::nvvm_sqrt_approx_f, Args);
+    Replacement = IRBuilder.CreateIntrinsic(
+        BuiltinCall.getType(), Intrinsic::nvvm_sqrt_approx_f, Args);
     break;
   default:
     return false;

From 01b30325b2818f5a6145c49a09a446eb5cf074d0 Mon Sep 17 00:00:00 2001
From: "Sidorov, Dmitry" <dmitry.sidorov@intel.com>
Date: Thu, 23 Jan 2025 03:25:57 -0800
Subject: [PATCH 05/16] rename

Signed-off-by: Sidorov, Dmitry <dmitry.sidorov@intel.com>
---
 llvm/lib/Transforms/Scalar/FPBuiltinFnSelection.cpp | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Transforms/Scalar/FPBuiltinFnSelection.cpp b/llvm/lib/Transforms/Scalar/FPBuiltinFnSelection.cpp
index bdaa253d9ce94..0e1a1510897b7 100644
--- a/llvm/lib/Transforms/Scalar/FPBuiltinFnSelection.cpp
+++ b/llvm/lib/Transforms/Scalar/FPBuiltinFnSelection.cpp
@@ -109,7 +109,9 @@ static bool replaceWithLLVMIR(FPBuiltinIntrinsic &BuiltinCall) {
 
 // This function lowers llvm.fpbuiltin. intrinsic functions with 3.0 max-error
 // attribute to the appropriate nvvm approximate intrinsics if it's possible.
-static bool replaceWithApproxNVPTXCalls(FPBuiltinIntrinsic &BuiltinCall) {
+// If it's not possible - fallback to standart LLVM intrinsic or instruction.
+static bool replaceWithApproxNVPTXCallsOrFallback(
+    FPBuiltinIntrinsic &BuiltinCall) {
   IRBuilder<> IRBuilder(&BuiltinCall);
   SmallVector<Value *> Args(BuiltinCall.args());
   Value *Replacement = nullptr;
@@ -195,7 +197,7 @@ static bool selectFnForFPBuiltinCalls(const TargetLibraryInfo &TLI,
   // Lets map them on NVPTX intrinsics. If no appropriate intrinsics are known
   // - skip to replaceWithAltMathFunction.
   if (T.isNVPTX() && BuiltinCall.getRequiredAccuracy().value() == 3.0) {
-    if (replaceWithApproxNVPTXCalls(BuiltinCall))
+    if (replaceWithApproxNVPTXCallsOrFallback(BuiltinCall))
       return true;
   }
 

From 5ae0e94005b76e453d97848f9fbecbbdbfb7c52d Mon Sep 17 00:00:00 2001
From: "Sidorov, Dmitry" <dmitry.sidorov@intel.com>
Date: Thu, 23 Jan 2025 04:17:50 -0800
Subject: [PATCH 06/16] typo

Signed-off-by: Sidorov, Dmitry <dmitry.sidorov@intel.com>
---
 llvm/lib/Transforms/Scalar/FPBuiltinFnSelection.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Transforms/Scalar/FPBuiltinFnSelection.cpp b/llvm/lib/Transforms/Scalar/FPBuiltinFnSelection.cpp
index 0e1a1510897b7..6a5891f7c5236 100644
--- a/llvm/lib/Transforms/Scalar/FPBuiltinFnSelection.cpp
+++ b/llvm/lib/Transforms/Scalar/FPBuiltinFnSelection.cpp
@@ -109,7 +109,8 @@ static bool replaceWithLLVMIR(FPBuiltinIntrinsic &BuiltinCall) {
 
 // This function lowers llvm.fpbuiltin. intrinsic functions with 3.0 max-error
 // attribute to the appropriate nvvm approximate intrinsics if it's possible.
-// If it's not possible - fallback to standart LLVM intrinsic or instruction.
+// If it's not possible - fallback to standard C/C++ library LLVM intrinsic or
+// instruction.
 static bool replaceWithApproxNVPTXCallsOrFallback(
     FPBuiltinIntrinsic &BuiltinCall) {
   IRBuilder<> IRBuilder(&BuiltinCall);

From 6720ed0ad6aed5968dfda390800d50f6e807cff3 Mon Sep 17 00:00:00 2001
From: "Sidorov, Dmitry" <dmitry.sidorov@intel.com>
Date: Thu, 23 Jan 2025 04:21:20 -0800
Subject: [PATCH 07/16] typo

Signed-off-by: Sidorov, Dmitry <dmitry.sidorov@intel.com>
---
 llvm/lib/Transforms/Scalar/FPBuiltinFnSelection.cpp | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/Transforms/Scalar/FPBuiltinFnSelection.cpp b/llvm/lib/Transforms/Scalar/FPBuiltinFnSelection.cpp
index 6a5891f7c5236..c21473a74e758 100644
--- a/llvm/lib/Transforms/Scalar/FPBuiltinFnSelection.cpp
+++ b/llvm/lib/Transforms/Scalar/FPBuiltinFnSelection.cpp
@@ -109,8 +109,8 @@ static bool replaceWithLLVMIR(FPBuiltinIntrinsic &BuiltinCall) {
 
 // This function lowers llvm.fpbuiltin. intrinsic functions with 3.0 max-error
 // attribute to the appropriate nvvm approximate intrinsics if it's possible.
-// If it's not possible - fallback to standard C/C++ library LLVM intrinsic or
-// instruction.
+// If it's not possible - fallback to instruction or standard C/C++ library LLVM
+// intrinsic.
 static bool replaceWithApproxNVPTXCallsOrFallback(
     FPBuiltinIntrinsic &BuiltinCall) {
   IRBuilder<> IRBuilder(&BuiltinCall);
@@ -119,9 +119,10 @@ static bool replaceWithApproxNVPTXCallsOrFallback(
   auto *Type = BuiltinCall.getType();
   // For now only add lowering for fdiv and sqrt. Yet nvvm intrinsics have
   // approximate variants for sin, cos, exp2 and log2.
-  // For vector fpbuiltins for NVPTX target we don't have nvvm intrinsics, use
-  // standart for LLVM math operations. Also nvvm fdiv and sqrt intrisics
-  // support only float type.
+  // For vector fpbuiltins for NVPTX target we don't have nvvm intrinsics,
+  // fallback to instruction or standard C/C++ library LLVM intrinsic. Also
+  // nvvm fdiv and sqrt intrisics support only float type, so fallback in this
+  // case as well.
   switch (BuiltinCall.getIntrinsicID()) {
   case Intrinsic::fpbuiltin_fdiv:
     if (Type->isVectorTy() || !Type->getScalarType()->isFloatTy())

From 5b6411ab23ee9504ce7fc8bdf402af4d20913110 Mon Sep 17 00:00:00 2001
From: "Sidorov, Dmitry" <dmitry.sidorov@intel.com>
Date: Thu, 23 Jan 2025 06:02:27 -0800
Subject: [PATCH 08/16] format

Signed-off-by: Sidorov, Dmitry <dmitry.sidorov@intel.com>
---
 llvm/lib/Transforms/Scalar/FPBuiltinFnSelection.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Transforms/Scalar/FPBuiltinFnSelection.cpp b/llvm/lib/Transforms/Scalar/FPBuiltinFnSelection.cpp
index c21473a74e758..7719de44b5f0c 100644
--- a/llvm/lib/Transforms/Scalar/FPBuiltinFnSelection.cpp
+++ b/llvm/lib/Transforms/Scalar/FPBuiltinFnSelection.cpp
@@ -111,8 +111,8 @@ static bool replaceWithLLVMIR(FPBuiltinIntrinsic &BuiltinCall) {
 // attribute to the appropriate nvvm approximate intrinsics if it's possible.
 // If it's not possible - fallback to instruction or standard C/C++ library LLVM
 // intrinsic.
-static bool replaceWithApproxNVPTXCallsOrFallback(
-    FPBuiltinIntrinsic &BuiltinCall) {
+static bool
+replaceWithApproxNVPTXCallsOrFallback(FPBuiltinIntrinsic &BuiltinCall) {
   IRBuilder<> IRBuilder(&BuiltinCall);
   SmallVector<Value *> Args(BuiltinCall.args());
   Value *Replacement = nullptr;

From 32d8f6bbcf7248106b33474f84eb758b2478ac0f Mon Sep 17 00:00:00 2001
From: "Sidorov, Dmitry" <dmitry.sidorov@intel.com>
Date: Tue, 28 Jan 2025 03:30:37 -0800
Subject: [PATCH 09/16] wip

Signed-off-by: Sidorov, Dmitry <dmitry.sidorov@intel.com>
---
 llvm/lib/Transforms/Scalar/FPBuiltinFnSelection.cpp | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/Transforms/Scalar/FPBuiltinFnSelection.cpp b/llvm/lib/Transforms/Scalar/FPBuiltinFnSelection.cpp
index 7719de44b5f0c..17003a0c9e5f5 100644
--- a/llvm/lib/Transforms/Scalar/FPBuiltinFnSelection.cpp
+++ b/llvm/lib/Transforms/Scalar/FPBuiltinFnSelection.cpp
@@ -112,7 +112,8 @@ static bool replaceWithLLVMIR(FPBuiltinIntrinsic &BuiltinCall) {
 // If it's not possible - fallback to instruction or standard C/C++ library LLVM
 // intrinsic.
 static bool
-replaceWithApproxNVPTXCallsOrFallback(FPBuiltinIntrinsic &BuiltinCall) {
+replaceWithApproxNVPTXCallsOrFallback(FPBuiltinIntrinsic &BuiltinCall,
+                                      std::optional<float> Accuracy) {
   IRBuilder<> IRBuilder(&BuiltinCall);
   SmallVector<Value *> Args(BuiltinCall.args());
   Value *Replacement = nullptr;
@@ -198,8 +199,9 @@ static bool selectFnForFPBuiltinCalls(const TargetLibraryInfo &TLI,
   // We don't have implementation for CUDA approximate precision builtins.
   // Lets map them on NVPTX intrinsics. If no appropriate intrinsics are known
   // - skip to replaceWithAltMathFunction.
-  if (T.isNVPTX() && BuiltinCall.getRequiredAccuracy().value() == 3.0) {
-    if (replaceWithApproxNVPTXCallsOrFallback(BuiltinCall))
+  if (T.isNVPTX())
+    if (replaceWithApproxNVPTXCallsOrFallback(
+          BuiltinCall, BuiltinCall.getRequiredAccuracy()))
       return true;
   }
 

From 36f86888e50bf622f8ab9bd6c8dbbf782d397d89 Mon Sep 17 00:00:00 2001
From: "Sidorov, Dmitry" <dmitry.sidorov@intel.com>
Date: Tue, 28 Jan 2025 04:21:14 -0800
Subject: [PATCH 10/16] fix bug, apply comment

Signed-off-by: Sidorov, Dmitry <dmitry.sidorov@intel.com>
---
 llvm/lib/Transforms/Scalar/FPBuiltinFnSelection.cpp   |  9 ++++++---
 ...or-3.0.ll => fp-builtin-intrinsics-nvvm-approx.ll} | 11 ++++++-----
 2 files changed, 12 insertions(+), 8 deletions(-)
 rename llvm/test/CodeGen/NVPTX/{fp-builtin-intrinsics-nvvm-max-error-3.0.ll => fp-builtin-intrinsics-nvvm-approx.ll} (92%)

diff --git a/llvm/lib/Transforms/Scalar/FPBuiltinFnSelection.cpp b/llvm/lib/Transforms/Scalar/FPBuiltinFnSelection.cpp
index 17003a0c9e5f5..d69cc456f4086 100644
--- a/llvm/lib/Transforms/Scalar/FPBuiltinFnSelection.cpp
+++ b/llvm/lib/Transforms/Scalar/FPBuiltinFnSelection.cpp
@@ -107,7 +107,7 @@ static bool replaceWithLLVMIR(FPBuiltinIntrinsic &BuiltinCall) {
   return true;
 }
 
-// This function lowers llvm.fpbuiltin. intrinsic functions with 3.0 max-error
+// This function lowers llvm.fpbuiltin. intrinsic functions with max-error
 // attribute to the appropriate nvvm approximate intrinsics if it's possible.
 // If it's not possible - fallback to instruction or standard C/C++ library LLVM
 // intrinsic.
@@ -126,12 +126,16 @@ replaceWithApproxNVPTXCallsOrFallback(FPBuiltinIntrinsic &BuiltinCall,
   // case as well.
   switch (BuiltinCall.getIntrinsicID()) {
   case Intrinsic::fpbuiltin_fdiv:
+    if (Accuracy.value() != 2.5)
+      return false;
     if (Type->isVectorTy() || !Type->getScalarType()->isFloatTy())
       return replaceWithLLVMIR(BuiltinCall);
     Replacement =
         IRBuilder.CreateIntrinsic(Type, Intrinsic::nvvm_div_approx_f, Args);
     break;
   case Intrinsic::fpbuiltin_sqrt:
+    if (Accuracy.value() != 3.0)
+      return false;
     if (Type->isVectorTy() || !Type->getScalarType()->isFloatTy())
       return replaceWithLLVMIR(BuiltinCall);
     Replacement = IRBuilder.CreateIntrinsic(
@@ -199,11 +203,10 @@ static bool selectFnForFPBuiltinCalls(const TargetLibraryInfo &TLI,
   // We don't have implementation for CUDA approximate precision builtins.
   // Lets map them on NVPTX intrinsics. If no appropriate intrinsics are known
   // - skip to replaceWithAltMathFunction.
-  if (T.isNVPTX())
+  if (T.isNVPTX() && BuiltinCall.getRequiredAccuracy().value() != 0.5)
     if (replaceWithApproxNVPTXCallsOrFallback(
           BuiltinCall, BuiltinCall.getRequiredAccuracy()))
       return true;
-  }
 
   /// Call TLI to select a function implementation to call
   StringRef ImplName = TLI.selectFPBuiltinImplementation(&BuiltinCall);
diff --git a/llvm/test/CodeGen/NVPTX/fp-builtin-intrinsics-nvvm-max-error-3.0.ll b/llvm/test/CodeGen/NVPTX/fp-builtin-intrinsics-nvvm-approx.ll
similarity index 92%
rename from llvm/test/CodeGen/NVPTX/fp-builtin-intrinsics-nvvm-max-error-3.0.ll
rename to llvm/test/CodeGen/NVPTX/fp-builtin-intrinsics-nvvm-approx.ll
index 0827c668b8609..51b18b4e3bab6 100644
--- a/llvm/test/CodeGen/NVPTX/fp-builtin-intrinsics-nvvm-max-error-3.0.ll
+++ b/llvm/test/CodeGen/NVPTX/fp-builtin-intrinsics-nvvm-approx.ll
@@ -47,8 +47,8 @@ declare <2 x double> @llvm.fpbuiltin.fdiv.v2f64(<2 x double>, <2 x double>)
 ; CHECK: %{{.*}} = call <2 x float> @llvm.sqrt.v2f32(<2 x float> %{{.*}})
 define void @test_sqrt(float %d, <2 x float> %v2d, <4 x float> %v4d) {
 entry:
-  %t0 = call float @llvm.fpbuiltin.sqrt.f32(float %d) #0
-  %t1 = call <2 x float> @llvm.fpbuiltin.sqrt.v2f32(<2 x float> %v2d) #0
+  %t0 = call float @llvm.fpbuiltin.sqrt.f32(float %d) #1
+  %t1 = call <2 x float> @llvm.fpbuiltin.sqrt.v2f32(<2 x float> %v2d) #1
   ret void
 }
 
@@ -60,12 +60,13 @@ declare <2 x float> @llvm.fpbuiltin.sqrt.v2f32(<2 x float>)
 ; CHECK: %{{.*}} = call <2 x double> @llvm.sqrt.v2f64(<2 x double> %{{.*}})
 define void @test_sqrt_double(double %d, <2 x double> %v2d) {
 entry:
-  %t0 = call double @llvm.fpbuiltin.sqrt.f64(double %d) #0
-  %t1 = call <2 x double> @llvm.fpbuiltin.sqrt.v2f64(<2 x double> %v2d) #0
+  %t0 = call double @llvm.fpbuiltin.sqrt.f64(double %d) #1
+  %t1 = call <2 x double> @llvm.fpbuiltin.sqrt.v2f64(<2 x double> %v2d) #1
   ret void
 }
 
 declare double @llvm.fpbuiltin.sqrt.f64(double)
 declare <2 x double> @llvm.fpbuiltin.sqrt.v2f64(<2 x double>)
 
-attributes #0 = { "fpbuiltin-max-error"="3.0" }
+attributes #0 = { "fpbuiltin-max-error"="2.5" }
+attributes #1 = { "fpbuiltin-max-error"="3.0" }

From cc7333f60bcf4f12d1046e559304f2d68666673c Mon Sep 17 00:00:00 2001
From: "Sidorov, Dmitry" <dmitry.sidorov@intel.com>
Date: Tue, 28 Jan 2025 04:30:42 -0800
Subject: [PATCH 11/16] format

Signed-off-by: Sidorov, Dmitry <dmitry.sidorov@intel.com>
---
 llvm/lib/Transforms/Scalar/FPBuiltinFnSelection.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Transforms/Scalar/FPBuiltinFnSelection.cpp b/llvm/lib/Transforms/Scalar/FPBuiltinFnSelection.cpp
index d69cc456f4086..1b88e70cb231e 100644
--- a/llvm/lib/Transforms/Scalar/FPBuiltinFnSelection.cpp
+++ b/llvm/lib/Transforms/Scalar/FPBuiltinFnSelection.cpp
@@ -205,7 +205,7 @@ static bool selectFnForFPBuiltinCalls(const TargetLibraryInfo &TLI,
   // - skip to replaceWithAltMathFunction.
   if (T.isNVPTX() && BuiltinCall.getRequiredAccuracy().value() != 0.5)
     if (replaceWithApproxNVPTXCallsOrFallback(
-          BuiltinCall, BuiltinCall.getRequiredAccuracy()))
+            BuiltinCall, BuiltinCall.getRequiredAccuracy()))
       return true;
 
   /// Call TLI to select a function implementation to call

From 9ea17f831e83d4d37a64a4323f2fcc9dc9ff7607 Mon Sep 17 00:00:00 2001
From: "Sidorov, Dmitry" <dmitry.sidorov@intel.com>
Date: Wed, 29 Jan 2025 05:13:41 -0800
Subject: [PATCH 12/16] Address comment and add 0.5 max error for nvptx

Signed-off-by: Sidorov, Dmitry <dmitry.sidorov@intel.com>
---
 .../Scalar/FPBuiltinFnSelection.cpp           |  21 +-
 ...p-builtin-intrinsics-nvvm-max-error-0.5.ll | 219 ++++++++++++++++++
 2 files changed, 230 insertions(+), 10 deletions(-)
 create mode 100644 llvm/test/CodeGen/NVPTX/fp-builtin-intrinsics-nvvm-max-error-0.5.ll

diff --git a/llvm/lib/Transforms/Scalar/FPBuiltinFnSelection.cpp b/llvm/lib/Transforms/Scalar/FPBuiltinFnSelection.cpp
index 1b88e70cb231e..f742b6243bdfd 100644
--- a/llvm/lib/Transforms/Scalar/FPBuiltinFnSelection.cpp
+++ b/llvm/lib/Transforms/Scalar/FPBuiltinFnSelection.cpp
@@ -126,7 +126,7 @@ replaceWithApproxNVPTXCallsOrFallback(FPBuiltinIntrinsic &BuiltinCall,
   // case as well.
   switch (BuiltinCall.getIntrinsicID()) {
   case Intrinsic::fpbuiltin_fdiv:
-    if (Accuracy.value() != 2.5)
+    if (Accuracy.value() < 2.0)
       return false;
     if (Type->isVectorTy() || !Type->getScalarType()->isFloatTy())
       return replaceWithLLVMIR(BuiltinCall);
@@ -134,7 +134,7 @@ replaceWithApproxNVPTXCallsOrFallback(FPBuiltinIntrinsic &BuiltinCall,
         IRBuilder.CreateIntrinsic(Type, Intrinsic::nvvm_div_approx_f, Args);
     break;
   case Intrinsic::fpbuiltin_sqrt:
-    if (Accuracy.value() != 3.0)
+    if (Accuracy.value() < 1.0)
       return false;
     if (Type->isVectorTy() || !Type->getScalarType()->isFloatTy())
       return replaceWithLLVMIR(BuiltinCall);
@@ -182,10 +182,11 @@ static bool selectFnForFPBuiltinCalls(const TargetLibraryInfo &TLI,
     return replaceWithLLVMIR(BuiltinCall);
 
   // Several functions for "sycl" and "cuda" requires "0.5" accuracy levels,
-  // which means correctly rounded results. For now x86 host AltMathLibrary
-  // doesn't have such ability. For such accuracy level, the fpbuiltins
-  // should be replaced by equivalent IR operation or llvmbuiltins.
-  if (T.isX86() && BuiltinCall.getRequiredAccuracy().value() == 0.5) {
+  // which means correctly rounded results. For now, neither the x86 host nor
+  // the NVPTX AltMathLibrary has such ability. For this accuracy level, the
+  // fpbuiltins should be replaced by equivalent IR operations or LLVM builtins.
+  if ((T.isX86() || T.isNVPTX()) &&
+      BuiltinCall.getRequiredAccuracy().value() == 0.5) {
     switch (BuiltinCall.getIntrinsicID()) {
     case Intrinsic::fpbuiltin_fadd:
     case Intrinsic::fpbuiltin_fsub:
@@ -200,10 +201,10 @@ static bool selectFnForFPBuiltinCalls(const TargetLibraryInfo &TLI,
     }
   }
 
-  // We don't have implementation for CUDA approximate precision builtins.
-  // Lets map them on NVPTX intrinsics. If no appropriate intrinsics are known
-  // - skip to replaceWithAltMathFunction.
-  if (T.isNVPTX() && BuiltinCall.getRequiredAccuracy().value() != 0.5)
+  // AltMathLibrary doesn't have implementations for CUDA approximate precision
+  // builtins. Let's map them to NVPTX intrinsics. If no appropriate intrinsic
+  // is known, fall through to emitting an error.
+  if (T.isNVPTX() && BuiltinCall.getRequiredAccuracy().value() > 0.5)
     if (replaceWithApproxNVPTXCallsOrFallback(
             BuiltinCall, BuiltinCall.getRequiredAccuracy()))
       return true;
diff --git a/llvm/test/CodeGen/NVPTX/fp-builtin-intrinsics-nvvm-max-error-0.5.ll b/llvm/test/CodeGen/NVPTX/fp-builtin-intrinsics-nvvm-max-error-0.5.ll
new file mode 100644
index 0000000000000..af1f00dee16c0
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/fp-builtin-intrinsics-nvvm-max-error-0.5.ll
@@ -0,0 +1,219 @@
+; RUN: opt -fpbuiltin-fn-selection -S < %s | FileCheck %s
+
+; Several functions for "sycl" and "cuda" requires "0.5" accuracy levels,
+; Test if these fpbuiltins could be replaced by equivalaent IR operations
+; or llvm builtins.
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
+target triple = "nvptx64-nvidia-cuda"
+
+; CHECK-LABEL: @svml_fadd
+; CHECK: %0 = fadd fast float %f1, %f2
+; CHECK: %1 = fadd fast <4 x float> %v4f1, %v4f2
+; CHECK: %2 = fadd fast <8 x float> %v8f1, %v8f2
+; CHECK: %3 = fadd fast <16 x float> %v16f1, %v16f2
+; CHECK: %4 = fadd fast double %d1, %d2
+; CHECK: %5 = fadd fast <2 x double> %v2d1, %v2d2
+; CHECK: %6 = fadd fast <4 x double> %v4d1, %v4d2
+; CHECK: %7 = fadd fast <8 x double> %v8d1, %v8d2
+define void @svml_fadd(float %f1, <4 x float> %v4f1, <8 x float> %v8f1, <16 x float> %v16f1,
+                       float %f2, <4 x float> %v4f2, <8 x float> %v8f2, <16 x float> %v16f2,
+                       double %d1, <2 x double> %v2d1, <4 x double> %v4d1, <8 x double> %v8d1,
+                       double %d2, <2 x double> %v2d2, <4 x double> %v4d2, <8 x double> %v8d2) {
+entry:
+  %t0_0 = call fast float @llvm.fpbuiltin.fadd.f32(float %f1, float %f2) #0
+  %t1_0 = call fast <4 x float> @llvm.fpbuiltin.fadd.v4f32(<4 x float> %v4f1, <4 x float> %v4f2) #0
+  %t2_0 = call fast <8 x float> @llvm.fpbuiltin.fadd.v8f32(<8 x float> %v8f1, <8 x float> %v8f2) #0
+  %t3_0 = call fast <16 x float> @llvm.fpbuiltin.fadd.v16f32(<16 x float> %v16f1, <16 x float> %v16f2) #0
+  %t4_0 = call fast double @llvm.fpbuiltin.fadd.f64(double %d1, double %d2) #0
+  %t5_0 = call fast <2 x double> @llvm.fpbuiltin.fadd.v2f64(<2 x double> %v2d1, <2 x double> %v2d2) #0
+  %t6_0 = call fast <4 x double> @llvm.fpbuiltin.fadd.v4f64(<4 x double> %v4d1, <4 x double> %v4d2) #0
+  %t7_0 = call fast <8 x double> @llvm.fpbuiltin.fadd.v8f64(<8 x double> %v8d1, <8 x double> %v8d2) #0
+  ret void
+}
+
+declare float @llvm.fpbuiltin.fadd.f32(float, float)
+declare <4 x float> @llvm.fpbuiltin.fadd.v4f32(<4 x float>, <4 x float>)
+declare <8 x float> @llvm.fpbuiltin.fadd.v8f32(<8 x float>, <8 x float>)
+declare <16 x float> @llvm.fpbuiltin.fadd.v16f32(<16 x float>, <16 x float>)
+declare double @llvm.fpbuiltin.fadd.f64(double, double)
+declare <2 x double> @llvm.fpbuiltin.fadd.v2f64(<2 x double>, <2 x double>)
+declare <4 x double> @llvm.fpbuiltin.fadd.v4f64(<4 x double>, <4 x double>)
+declare <8 x double> @llvm.fpbuiltin.fadd.v8f64(<8 x double>, <8 x double>)
+
+; CHECK-LABEL: @svml_fsub
+; CHECK: %0 = fsub fast float %f1, %f2
+; CHECK: %1 = fsub fast <4 x float> %v4f1, %v4f2
+; CHECK: %2 = fsub fast <8 x float> %v8f1, %v8f2
+; CHECK: %3 = fsub fast <16 x float> %v16f1, %v16f2
+; CHECK: %4 = fsub fast double %d1, %d2
+; CHECK: %5 = fsub fast <2 x double> %v2d1, %v2d2
+; CHECK: %6 = fsub fast <4 x double> %v4d1, %v4d2
+; CHECK: %7 = fsub fast <8 x double> %v8d1, %v8d2
+define void @svml_fsub(float %f1, <4 x float> %v4f1, <8 x float> %v8f1, <16 x float> %v16f1,
+                       float %f2, <4 x float> %v4f2, <8 x float> %v8f2, <16 x float> %v16f2,
+                       double %d1, <2 x double> %v2d1, <4 x double> %v4d1, <8 x double> %v8d1,
+                       double %d2, <2 x double> %v2d2, <4 x double> %v4d2, <8 x double> %v8d2) {
+entry:
+  %t0_0 = call fast float @llvm.fpbuiltin.fsub.f32(float %f1, float %f2) #0
+  %t1_0 = call fast <4 x float> @llvm.fpbuiltin.fsub.v4f32(<4 x float> %v4f1, <4 x float> %v4f2) #0
+  %t2_0 = call fast <8 x float> @llvm.fpbuiltin.fsub.v8f32(<8 x float> %v8f1, <8 x float> %v8f2) #0
+  %t3_0 = call fast <16 x float> @llvm.fpbuiltin.fsub.v16f32(<16 x float> %v16f1, <16 x float> %v16f2) #0
+  %t4_0 = call fast double @llvm.fpbuiltin.fsub.f64(double %d1, double %d2) #0
+  %t5_0 = call fast <2 x double> @llvm.fpbuiltin.fsub.v2f64(<2 x double> %v2d1, <2 x double> %v2d2) #0
+  %t6_0 = call fast <4 x double> @llvm.fpbuiltin.fsub.v4f64(<4 x double> %v4d1, <4 x double> %v4d2) #0
+  %t7_0 = call fast <8 x double> @llvm.fpbuiltin.fsub.v8f64(<8 x double> %v8d1, <8 x double> %v8d2) #0
+  ret void
+}
+
+declare float @llvm.fpbuiltin.fsub.f32(float, float)
+declare <4 x float> @llvm.fpbuiltin.fsub.v4f32(<4 x float>, <4 x float>)
+declare <8 x float> @llvm.fpbuiltin.fsub.v8f32(<8 x float>, <8 x float>)
+declare <16 x float> @llvm.fpbuiltin.fsub.v16f32(<16 x float>, <16 x float>)
+declare double @llvm.fpbuiltin.fsub.f64(double, double)
+declare <2 x double> @llvm.fpbuiltin.fsub.v2f64(<2 x double>, <2 x double>)
+declare <4 x double> @llvm.fpbuiltin.fsub.v4f64(<4 x double>, <4 x double>)
+declare <8 x double> @llvm.fpbuiltin.fsub.v8f64(<8 x double>, <8 x double>)
+
+; CHECK-LABEL: @svml_fmul
+; CHECK: %0 = fmul fast float %f1, %f2
+; CHECK: %1 = fmul fast <4 x float> %v4f1, %v4f2
+; CHECK: %2 = fmul fast <8 x float> %v8f1, %v8f2
+; CHECK: %3 = fmul fast <16 x float> %v16f1, %v16f2
+; CHECK: %4 = fmul fast double %d1, %d2
+; CHECK: %5 = fmul fast <2 x double> %v2d1, %v2d2
+; CHECK: %6 = fmul fast <4 x double> %v4d1, %v4d2
+; CHECK: %7 = fmul fast <8 x double> %v8d1, %v8d2
+define void @svml_fmul(float %f1, <4 x float> %v4f1, <8 x float> %v8f1, <16 x float> %v16f1,
+                       float %f2, <4 x float> %v4f2, <8 x float> %v8f2, <16 x float> %v16f2,
+                       double %d1, <2 x double> %v2d1, <4 x double> %v4d1, <8 x double> %v8d1,
+                       double %d2, <2 x double> %v2d2, <4 x double> %v4d2, <8 x double> %v8d2) {
+entry:
+  %t0_0 = call fast float @llvm.fpbuiltin.fmul.f32(float %f1, float %f2) #0
+  %t1_0 = call fast <4 x float> @llvm.fpbuiltin.fmul.v4f32(<4 x float> %v4f1, <4 x float> %v4f2) #0
+  %t2_0 = call fast <8 x float> @llvm.fpbuiltin.fmul.v8f32(<8 x float> %v8f1, <8 x float> %v8f2) #0
+  %t3_0 = call fast <16 x float> @llvm.fpbuiltin.fmul.v16f32(<16 x float> %v16f1, <16 x float> %v16f2) #0
+  %t4_0 = call fast double @llvm.fpbuiltin.fmul.f64(double %d1, double %d2) #0
+  %t5_0 = call fast <2 x double> @llvm.fpbuiltin.fmul.v2f64(<2 x double> %v2d1, <2 x double> %v2d2) #0
+  %t6_0 = call fast <4 x double> @llvm.fpbuiltin.fmul.v4f64(<4 x double> %v4d1, <4 x double> %v4d2) #0
+  %t7_0 = call fast <8 x double> @llvm.fpbuiltin.fmul.v8f64(<8 x double> %v8d1, <8 x double> %v8d2) #0
+  ret void
+}
+
+declare float @llvm.fpbuiltin.fmul.f32(float, float)
+declare <4 x float> @llvm.fpbuiltin.fmul.v4f32(<4 x float>, <4 x float>)
+declare <8 x float> @llvm.fpbuiltin.fmul.v8f32(<8 x float>, <8 x float>)
+declare <16 x float> @llvm.fpbuiltin.fmul.v16f32(<16 x float>, <16 x float>)
+declare double @llvm.fpbuiltin.fmul.f64(double, double)
+declare <2 x double> @llvm.fpbuiltin.fmul.v2f64(<2 x double>, <2 x double>)
+declare <4 x double> @llvm.fpbuiltin.fmul.v4f64(<4 x double>, <4 x double>)
+declare <8 x double> @llvm.fpbuiltin.fmul.v8f64(<8 x double>, <8 x double>)
+
+; CHECK-LABEL: @svml_fdiv
+; CHECK: %0 = fdiv fast double %d1, %d2
+; CHECK: %1 = fdiv fast <2 x double> %v2d1, %v2d2
+; CHECK: %2 = fdiv fast <4 x double> %v4d1, %v4d2
+; CHECK: %3 = fdiv fast <8 x double> %v8d1, %v8d2
+define void @svml_fdiv(double %d1, <2 x double> %v2d1, <4 x double> %v4d1, <8 x double> %v8d1,
+                       double %d2, <2 x double> %v2d2, <4 x double> %v4d2, <8 x double> %v8d2) {
+entry:
+  %t0_0 = call fast double @llvm.fpbuiltin.fdiv.f64(double %d1, double %d2) #0
+  %t1_0 = call fast <2 x double> @llvm.fpbuiltin.fdiv.v2f64(<2 x double> %v2d1, <2 x double> %v2d2) #0
+  %t2_0 = call fast <4 x double> @llvm.fpbuiltin.fdiv.v4f64(<4 x double> %v4d1, <4 x double> %v4d2) #0
+  %t3_0 = call fast <8 x double> @llvm.fpbuiltin.fdiv.v8f64(<8 x double> %v8d1, <8 x double> %v8d2) #0
+  ret void
+}
+
+declare double @llvm.fpbuiltin.fdiv.f64(double, double)
+declare <2 x double> @llvm.fpbuiltin.fdiv.v2f64(<2 x double>, <2 x double>)
+declare <4 x double> @llvm.fpbuiltin.fdiv.v4f64(<4 x double>, <4 x double>)
+declare <8 x double> @llvm.fpbuiltin.fdiv.v8f64(<8 x double>, <8 x double>)
+
+; CHECK-LABEL: @svml_frem
+; CHECK: %0 = frem fast float %f1, %f2
+; CHECK: %1 = frem fast <4 x float> %v4f1, %v4f2
+; CHECK: %2 = frem fast <8 x float> %v8f1, %v8f2
+; CHECK: %3 = frem fast <16 x float> %v16f1, %v16f2
+; CHECK: %4 = frem fast double %d1, %d2
+; CHECK: %5 = frem fast <2 x double> %v2d1, %v2d2
+; CHECK: %6 = frem fast <4 x double> %v4d1, %v4d2
+; CHECK: %7 = frem fast <8 x double> %v8d1, %v8d2
+define void @svml_frem(float %f1, <4 x float> %v4f1, <8 x float> %v8f1, <16 x float> %v16f1,
+                       float %f2, <4 x float> %v4f2, <8 x float> %v8f2, <16 x float> %v16f2,
+                       double %d1, <2 x double> %v2d1, <4 x double> %v4d1, <8 x double> %v8d1,
+                       double %d2, <2 x double> %v2d2, <4 x double> %v4d2, <8 x double> %v8d2) {
+entry:
+  %t0_0 = call fast float @llvm.fpbuiltin.frem.f32(float %f1, float %f2) #0
+  %t1_0 = call fast <4 x float> @llvm.fpbuiltin.frem.v4f32(<4 x float> %v4f1, <4 x float> %v4f2) #0
+  %t2_0 = call fast <8 x float> @llvm.fpbuiltin.frem.v8f32(<8 x float> %v8f1, <8 x float> %v8f2) #0
+  %t3_0 = call fast <16 x float> @llvm.fpbuiltin.frem.v16f32(<16 x float> %v16f1, <16 x float> %v16f2) #0
+  %t4_0 = call fast double @llvm.fpbuiltin.frem.f64(double %d1, double %d2) #0
+  %t5_0 = call fast <2 x double> @llvm.fpbuiltin.frem.v2f64(<2 x double> %v2d1, <2 x double> %v2d2) #0
+  %t6_0 = call fast <4 x double> @llvm.fpbuiltin.frem.v4f64(<4 x double> %v4d1, <4 x double> %v4d2) #0
+  %t7_0 = call fast <8 x double> @llvm.fpbuiltin.frem.v8f64(<8 x double> %v8d1, <8 x double> %v8d2) #0
+  ret void
+}
+
+declare float @llvm.fpbuiltin.frem.f32(float, float)
+declare <4 x float> @llvm.fpbuiltin.frem.v4f32(<4 x float>, <4 x float>)
+declare <8 x float> @llvm.fpbuiltin.frem.v8f32(<8 x float>, <8 x float>)
+declare <16 x float> @llvm.fpbuiltin.frem.v16f32(<16 x float>, <16 x float>)
+declare double @llvm.fpbuiltin.frem.f64(double, double)
+declare <2 x double> @llvm.fpbuiltin.frem.v2f64(<2 x double>, <2 x double>)
+declare <4 x double> @llvm.fpbuiltin.frem.v4f64(<4 x double>, <4 x double>)
+declare <8 x double> @llvm.fpbuiltin.frem.v8f64(<8 x double>, <8 x double>)
+
+; CHECK-LABEL: @svml_sqrt
+; CHECK: %0 = call double @llvm.sqrt.f64(double %d)
+; CHECK: %1 = call <2 x double> @llvm.sqrt.v2f64(<2 x double> %v2d)
+; CHECK: %2 = call <4 x double> @llvm.sqrt.v4f64(<4 x double> %v4d)
+; CHECK: %3 = call <8 x double> @llvm.sqrt.v8f64(<8 x double> %v8d)
+define void @svml_sqrt(double %d, <2 x double> %v2d, <4 x double> %v4d, <8 x double> %v8d) {
+entry:
+  %t4_0 = call double @llvm.fpbuiltin.sqrt.f64(double %d) #0
+  %t5_0 = call <2 x double> @llvm.fpbuiltin.sqrt.v2f64(<2 x double> %v2d) #0
+  %t6_0 = call <4 x double> @llvm.fpbuiltin.sqrt.v4f64(<4 x double> %v4d) #0
+  %t7_0 = call <8 x double> @llvm.fpbuiltin.sqrt.v8f64(<8 x double> %v8d) #0
+  ret void
+}
+
+declare double @llvm.fpbuiltin.sqrt.f64(double)
+declare <2 x double> @llvm.fpbuiltin.sqrt.v2f64(<2 x double>)
+declare <4 x double> @llvm.fpbuiltin.sqrt.v4f64(<4 x double>)
+declare <8 x double> @llvm.fpbuiltin.sqrt.v8f64(<8 x double>)
+
+; CHECK-LABEL: @svml_ldexp
+; CHECK: %0 = call fast float @llvm.ldexp.f32.i32(float %f1, i32 %f2)
+; CHECK: %1 = call fast <4 x float> @llvm.ldexp.v4f32.v4i32(<4 x float> %v4f1, <4 x i32> %v4f2)
+; CHECK: %2 = call fast <8 x float> @llvm.ldexp.v8f32.v8i32(<8 x float> %v8f1, <8 x i32> %v8f2)
+; CHECK: %3 = call fast <16 x float> @llvm.ldexp.v16f32.v16i32(<16 x float> %v16f1, <16 x i32> %v16f2)
+; CHECK: %4 = call fast double @llvm.ldexp.f64.i32(double %d1, i32 %d2)
+; CHECK: %5 = call fast <2 x double> @llvm.ldexp.v2f64.v2i32(<2 x double> %v2d1, <2 x i32> %v2d2)
+; CHECK: %6 = call fast <4 x double> @llvm.ldexp.v4f64.v4i32(<4 x double> %v4d1, <4 x i32> %v4d2)
+; CHECK: %7 = call fast <8 x double> @llvm.ldexp.v8f64.v8i32(<8 x double> %v8d1, <8 x i32> %v8d2)
+define void @svml_ldexp(float %f1, <4 x float> %v4f1, <8 x float> %v8f1, <16 x float> %v16f1,
+                        i32 %f2, <4 x i32> %v4f2, <8 x i32> %v8f2, <16 x i32> %v16f2,
+                        double %d1, <2 x double> %v2d1, <4 x double> %v4d1, <8 x double> %v8d1,
+                        i32 %d2, <2 x i32> %v2d2, <4 x i32> %v4d2, <8 x i32> %v8d2) {
+entry:
+  %t0_0 = call fast float @llvm.fpbuiltin.ldexp.f32.i32(float %f1, i32 %f2) #0
+  %t1_0 = call fast <4 x float> @llvm.fpbuiltin.ldexp.v4f32.v4i32(<4 x float> %v4f1, <4 x i32> %v4f2) #0
+  %t2_0 = call fast <8 x float> @llvm.fpbuiltin.ldexp.v8f32.v8i32(<8 x float> %v8f1, <8 x i32> %v8f2) #0
+  %t3_0 = call fast <16 x float> @llvm.fpbuiltin.ldexp.v16f32.v16i32(<16 x float> %v16f1, <16 x i32> %v16f2) #0
+  %t4_0 = call fast double @llvm.fpbuiltin.ldexp.f64.i32(double %d1, i32 %d2) #0
+  %t5_0 = call fast <2 x double> @llvm.fpbuiltin.ldexp.v2f64.v2i32(<2 x double> %v2d1, <2 x i32> %v2d2) #0
+  %t6_0 = call fast <4 x double> @llvm.fpbuiltin.ldexp.v4f64.v4i32(<4 x double> %v4d1, <4 x i32> %v4d2) #0
+  %t7_0 = call fast <8 x double> @llvm.fpbuiltin.ldexp.v8f64.v8i32(<8 x double> %v8d1, <8 x i32> %v8d2) #0
+  ret void
+}
+
+declare float @llvm.fpbuiltin.ldexp.f32.i32(float, i32)
+declare <4 x float> @llvm.fpbuiltin.ldexp.v4f32.v4i32(<4 x float>, <4 x i32>)
+declare <8 x float> @llvm.fpbuiltin.ldexp.v8f32.v8i32(<8 x float>, <8 x i32>)
+declare <16 x float> @llvm.fpbuiltin.ldexp.v16f32.v16i32(<16 x float>, <16 x i32>)
+declare double @llvm.fpbuiltin.ldexp.f64.i32(double, i32)
+declare <2 x double> @llvm.fpbuiltin.ldexp.v2f64.v2i32(<2 x double>, <2 x i32>)
+declare <4 x double> @llvm.fpbuiltin.ldexp.v4f64.v4i32(<4 x double>, <4 x i32>)
+declare <8 x double> @llvm.fpbuiltin.ldexp.v8f64.v8i32(<8 x double>, <8 x i32>)
+
+attributes #0 = { "fpbuiltin-max-error"="0.5" }

From 700ae48f49bd9e0d23f52b753c2503eedbb10959 Mon Sep 17 00:00:00 2001
From: "Sidorov, Dmitry" <dmitry.sidorov@intel.com>
Date: Wed, 29 Jan 2025 06:07:57 -0800
Subject: [PATCH 13/16] format

Signed-off-by: Sidorov, Dmitry <dmitry.sidorov@intel.com>
---
 llvm/lib/Transforms/Scalar/FPBuiltinFnSelection.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Transforms/Scalar/FPBuiltinFnSelection.cpp b/llvm/lib/Transforms/Scalar/FPBuiltinFnSelection.cpp
index f742b6243bdfd..b64241d2fd809 100644
--- a/llvm/lib/Transforms/Scalar/FPBuiltinFnSelection.cpp
+++ b/llvm/lib/Transforms/Scalar/FPBuiltinFnSelection.cpp
@@ -138,8 +138,8 @@ replaceWithApproxNVPTXCallsOrFallback(FPBuiltinIntrinsic &BuiltinCall,
       return false;
     if (Type->isVectorTy() || !Type->getScalarType()->isFloatTy())
       return replaceWithLLVMIR(BuiltinCall);
-    Replacement = IRBuilder.CreateIntrinsic(
-        BuiltinCall.getType(), Intrinsic::nvvm_sqrt_approx_f, Args);
+    Replacement =
+        IRBuilder.CreateIntrinsic(Type, Intrinsic::nvvm_sqrt_approx_f, Args);
     break;
   default:
     return false;

From 2df0a72ec97416d06c6ed0a23f3f83f0531e7219 Mon Sep 17 00:00:00 2001
From: "Sidorov, Dmitry" <dmitry.sidorov@intel.com>
Date: Wed, 29 Jan 2025 10:32:01 -0800
Subject: [PATCH 14/16] apply suggestions

Signed-off-by: Sidorov, Dmitry <dmitry.sidorov@intel.com>
---
 .../CodeGen/NVPTX/fp-builtin-intrinsics-nvvm-max-error-0.5.ll | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/test/CodeGen/NVPTX/fp-builtin-intrinsics-nvvm-max-error-0.5.ll b/llvm/test/CodeGen/NVPTX/fp-builtin-intrinsics-nvvm-max-error-0.5.ll
index af1f00dee16c0..3777ad8c52b6d 100644
--- a/llvm/test/CodeGen/NVPTX/fp-builtin-intrinsics-nvvm-max-error-0.5.ll
+++ b/llvm/test/CodeGen/NVPTX/fp-builtin-intrinsics-nvvm-max-error-0.5.ll
@@ -1,8 +1,8 @@
 ; RUN: opt -fpbuiltin-fn-selection -S < %s | FileCheck %s
 
-; Several functions for "sycl" and "cuda" requires "0.5" accuracy levels,
+; Several functions for SYCL and CUDA require "0.5" accuracy levels.
 ; Test if these fpbuiltins could be replaced by equivalaent IR operations
-; or llvm builtins.
+; or LLVM builtins.
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
 target triple = "nvptx64-nvidia-cuda"

From e86be33d61863f66f3238a2362398650c0273eac Mon Sep 17 00:00:00 2001
From: "Sidorov, Dmitry" <dmitry.sidorov@intel.com>
Date: Wed, 29 Jan 2025 13:43:39 -0800
Subject: [PATCH 15/16] add test

Signed-off-by: Sidorov, Dmitry <dmitry.sidorov@intel.com>
---
 .../fp-builtin-intrinsics-nvvm-approx.ll      | 22 +++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/llvm/test/CodeGen/NVPTX/fp-builtin-intrinsics-nvvm-approx.ll b/llvm/test/CodeGen/NVPTX/fp-builtin-intrinsics-nvvm-approx.ll
index 51b18b4e3bab6..d341053058a22 100644
--- a/llvm/test/CodeGen/NVPTX/fp-builtin-intrinsics-nvvm-approx.ll
+++ b/llvm/test/CodeGen/NVPTX/fp-builtin-intrinsics-nvvm-approx.ll
@@ -25,6 +25,17 @@ entry:
   ret void
 }
 
+; CHECK-LABEL: @test_fdiv_max_error
+; CHECK: %{{.*}} = call float @llvm.nvvm.div.approx.f(float %{{.*}}, float %{{.*}})
+; CHECK: %{{.*}} = fdiv <2 x float> %{{.*}}, %{{.*}}
+define void @test_fdiv_max_error(float %d1, <2 x float> %v2d1,
+                                 float %d2, <2 x float> %v2d2) {
+entry:
+  %t0 = call float @llvm.fpbuiltin.fdiv.f32(float %d1, float %d2) #2
+  %t1 = call <2 x float> @llvm.fpbuiltin.fdiv.v2f32(<2 x float> %v2d1, <2 x float> %v2d2) #2
+  ret void
+}
+
 declare float @llvm.fpbuiltin.fdiv.f32(float, float)
 declare <2 x float> @llvm.fpbuiltin.fdiv.v2f32(<2 x float>, <2 x float>)
 
@@ -52,6 +63,16 @@ entry:
   ret void
 }
 
+; CHECK-LABEL: @test_sqrt_max_error
+; CHECK: %{{.*}} = call float @llvm.nvvm.sqrt.approx.f(float %{{.*}})
+; CHECK: %{{.*}} = call <2 x float> @llvm.sqrt.v2f32(<2 x float> %{{.*}})
+define void @test_sqrt_max_error(float %d, <2 x float> %v2d, <4 x float> %v4d) {
+entry:
+  %t0 = call float @llvm.fpbuiltin.sqrt.f32(float %d) #2
+  %t1 = call <2 x float> @llvm.fpbuiltin.sqrt.v2f32(<2 x float> %v2d) #2
+  ret void
+}
+
 declare float @llvm.fpbuiltin.sqrt.f32(float)
 declare <2 x float> @llvm.fpbuiltin.sqrt.v2f32(<2 x float>)
 
@@ -70,3 +91,4 @@ declare <2 x double> @llvm.fpbuiltin.sqrt.v2f64(<2 x double>)
 
 attributes #0 = { "fpbuiltin-max-error"="2.5" }
 attributes #1 = { "fpbuiltin-max-error"="3.0" }
+attributes #1 = { "fpbuiltin-max-error"="10.0" }

From 6113cc3f4ace250d8da3cab6eaf83fabc87fec7c Mon Sep 17 00:00:00 2001
From: "Sidorov, Dmitry" <dmitry.sidorov@intel.com>
Date: Wed, 29 Jan 2025 16:24:32 -0800
Subject: [PATCH 16/16] fix typo

Signed-off-by: Sidorov, Dmitry <dmitry.sidorov@intel.com>
---
 llvm/test/CodeGen/NVPTX/fp-builtin-intrinsics-nvvm-approx.ll | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/test/CodeGen/NVPTX/fp-builtin-intrinsics-nvvm-approx.ll b/llvm/test/CodeGen/NVPTX/fp-builtin-intrinsics-nvvm-approx.ll
index d341053058a22..6c7ce8af804d9 100644
--- a/llvm/test/CodeGen/NVPTX/fp-builtin-intrinsics-nvvm-approx.ll
+++ b/llvm/test/CodeGen/NVPTX/fp-builtin-intrinsics-nvvm-approx.ll
@@ -91,4 +91,4 @@ declare <2 x double> @llvm.fpbuiltin.sqrt.v2f64(<2 x double>)
 
 attributes #0 = { "fpbuiltin-max-error"="2.5" }
 attributes #1 = { "fpbuiltin-max-error"="3.0" }
-attributes #1 = { "fpbuiltin-max-error"="10.0" }
+attributes #2 = { "fpbuiltin-max-error"="10.0" }