[Codegen] Add asin domain check and align NaN-handling in pooling with CUDA semantics

vacu9708 · vacu9708 · commit 1856d9e76b08 · 2025-05-19T20:56:37.000+09:00
- Update `tir.asin` to return a quiet NaN when the input is outside [-1, 1].
- Update the LLVM codegen for `max`/`min` (used in pooling) to
  align CPU behavior with CUDA when handling NaN values.
- Update the regex in the aarch64 codegen test to match the NaN-suppressing fminnm/fmaxnm instructions in addition to fmin/fmax.
diff --git a/src/target/llvm/codegen_llvm.cc b/src/target/llvm/codegen_llvm.cc
@@ -1623,12 +1623,24 @@ llvm::Value* CodeGenLLVM::VisitExpr_(const ModNode* op) {
 llvm::Value* CodeGenLLVM::VisitExpr_(const MinNode* op) {
   llvm::Value* a = MakeValue(op->a);
   llvm::Value* b = MakeValue(op->b);
+
+  // llvm.minnum (IEEE-754 minNum): if exactly one operand is NaN, the other one is returned
+  if (op->a.dtype().is_float() && op->a.dtype().bits() >= 32) {
+    return builder_->CreateBinaryIntrinsic(llvm::Intrinsic::minnum, a, b);
+  }
+  // All other dtypes (e.g. integers, floats narrower than 32 bits): compare, then select a or b
   return builder_->CreateSelect(CreateLT(op->a.dtype(), a, b), a, b);
 }
 
 llvm::Value* CodeGenLLVM::VisitExpr_(const MaxNode* op) {
   llvm::Value* a = MakeValue(op->a);
   llvm::Value* b = MakeValue(op->b);
+
+  // llvm.maxnum (IEEE-754 maxNum): if exactly one operand is NaN, the other one is returned
+  if (op->a.dtype().is_float() && op->a.dtype().bits() >= 32) {
+    return builder_->CreateBinaryIntrinsic(llvm::Intrinsic::maxnum, a, b);
+  }
+  // All other dtypes (e.g. integers, floats narrower than 32 bits): compare, then select a or b
   return builder_->CreateSelect(CreateGT(op->a.dtype(), a, b), a, b);
 }
 
diff --git a/src/target/llvm/intrin_rule_llvm.cc b/src/target/llvm/intrin_rule_llvm.cc
@@ -30,6 +30,8 @@
 #include <tvm/tir/op.h>
 #include <tvm/tir/op_attr_types.h>
 
+#include <limits>
+
 #include "../intrin_rule.h"
 
 namespace tvm {
@@ -175,7 +177,15 @@ TVM_REGISTER_OP("tir.asin")
       PrimExpr term7 = term5 * x2 * make_const(x.dtype(), 25) / make_const(x.dtype(), 112);
       PrimExpr term9 = term7 * x2 * make_const(x.dtype(), 1225) / make_const(x.dtype(), 3456);
       PrimExpr term11 = term9 * x2 * make_const(x.dtype(), 3969) / make_const(x.dtype(), 28160);
-      return term1 + term3 + term5 + term7 + term9 + term11;
+      PrimExpr series = term1 + term3 + term5 + term7 + term9 + term11;
+      /* --- domain limit check --- */
+      PrimExpr lower = make_const(x.dtype(), -1.0);
+      PrimExpr upper = make_const(x.dtype(), 1.0);
+      PrimExpr out_range = tir::Or(x<lower, x> upper);
+      // Use a quiet NaN constant
+      PrimExpr nan_const = make_const(x.dtype(), std::numeric_limits<double>::quiet_NaN());
+      // select: if out of [-1,1] → NaN, else → series
+      return tir::Select(out_range, nan_const, series);
     });
 
 TVM_REGISTER_OP("tir.acos")
diff --git a/tests/python/codegen/test_target_codegen_aarch64.py b/tests/python/codegen/test_target_codegen_aarch64.py
@@ -184,7 +184,7 @@ def check_correct_assembly(type):
         )
         select = re.findall("sel\tz[0-9].[shdb], p[0-9], z[0-9].[shdb], z[0-9].[shdb]", assembly)
         max = re.findall(
-            r"max\tz[0-9].[shdb],( p[0-9]/[zm],)? z[0-9].[shdb], z[0-9].[shdb]", assembly
+            r"f?max(?:nm)?\tz[0-9].[shdb],( p[0-9]/[zm],)? z[0-9].[shdb], z[0-9].[shdb]", assembly
         )
 
         assert len(loads) > 1
@@ -220,7 +220,7 @@ def check_correct_assembly(type):
         )
         select = re.findall("sel\tz[0-9].[shdb], p[0-9], z[0-9].[shdb], z[0-9].[shdb]", assembly)
         min = re.findall(
-            r"min\tz[0-9].[shdb],( p[0-9]/[zm],)? z[0-9].[shdb], z[0-9].[shdb]", assembly
+            r"f?min(?:nm)?\tz[0-9].[shdb],( p[0-9]/[zm],)? z[0-9].[shdb], z[0-9].[shdb]", assembly
         )
 
         assert len(loads) > 1
diff --git a/tests/python/tir-base/test_tir_intrin.py b/tests/python/tir-base/test_tir_intrin.py
@@ -100,6 +100,23 @@ def run_test(tvm_intrin, np_func, atol=1e-5, rtol=1e-5):
         func(a, b)
         tvm.testing.assert_allclose(b.numpy(), np_func(a.numpy()), atol=atol, rtol=rtol)
 
+        # Out-of-bounds test for asin/acos
+        name = tvm_intrin.__name__
+        if name in ("asin", "acos"):
+            # generate some values outside [-1, 1]
+            n = 8
+            out_np = np.concatenate(
+                [
+                    np.random.uniform(1.1, 2.0, size=n // 2),
+                    np.random.uniform(-2.0, -1.1, size=n // 2),
+                ]
+            ).astype(A.dtype)
+            a2 = tvm.nd.array(out_np, dev)
+            b2 = tvm.nd.array(np.empty_like(out_np), dev)
+            func(a2, b2)
+            # all outputs should be NaN
+            assert np.all(np.isnan(b2.numpy()))
+
     for func in test_funcs:
         atol = rtol = 1e-3 if func[0].__name__ in ["asin", "acos", "atan"] else 1e-5
         run_test(*func, atol, rtol)
diff --git a/tests/python/tir-transform/test_tir_transform_lower_intrin.py b/tests/python/tir-transform/test_tir_transform_lower_intrin.py
@@ -117,6 +117,46 @@ def test_lower_floormod():
         check_value(res, x, y, [(a, b) for a, b in data if b == 8], lambda a, b: a % b)
 
 
+# Max / Min NaN-handling (IEEE-754 maxNum / minNum)
+@tvm.testing.requires_llvm
+def test_lower_maxmin_nan():
+    def get_fp_data():
+        # covers (a, b), (a, NaN), (NaN, b), (NaN, NaN)
+        x_vals = [-3.0, 0.0, 7.5, np.nan]
+        y_vals = [2.0, np.nan, -8.0, np.nan]
+        return list(zip(x_vals, y_vals))
+
+    data = get_fp_data()
+    for dtype in ["float32", "float64"]:
+        x = te.var("x", dtype=dtype)
+        y = te.var("y", dtype=dtype)
+
+        res_max = lower_intrin([x, y], tvm.te.max(x, y))
+
+        def ref_max(a, b):
+            # IEEE-754 maxNum semantics
+            if np.isnan(a):
+                return b
+            if np.isnan(b):
+                return a
+            return max(a, b)
+
+        check_value(res_max, x, y, data, ref_max)
+
+        res_min = lower_intrin([x, y], tvm.te.min(x, y))
+
+        def ref_min(a, b):
+            # IEEE-754 minNum semantics
+            if np.isnan(a):
+                return b
+            if np.isnan(b):
+                return a
+            return min(a, b)
+
+        check_value(res_min, x, y, data, ref_min)
+
+
 if __name__ == "__main__":
     test_lower_floordiv()
     test_lower_floormod()
+    test_lower_maxmin_nan()

Original file line number	Diff line number	Diff line change
`@@ -184,7 +184,7 @@ def check_correct_assembly(type):`
`184`	`184`	`)`
`185`	`185`	`select = re.findall("sel\tz[0-9].[shdb], p[0-9], z[0-9].[shdb], z[0-9].[shdb]", assembly)`
`186`	`186`	`max = re.findall(`
`187`		`- r"max\tz[0-9].[shdb],( p[0-9]/[zm],)? z[0-9].[shdb], z[0-9].[shdb]", assembly`
	`187`	`+ r"f?max(?:nm)?\tz[0-9].[shdb],( p[0-9]/[zm],)? z[0-9].[shdb], z[0-9].[shdb]", assembly`
`188`	`188`	`)`
`189`	`189`
`190`	`190`	`assert len(loads) > 1`
`@@ -220,7 +220,7 @@ def check_correct_assembly(type):`
`220`	`220`	`)`
`221`	`221`	`select = re.findall("sel\tz[0-9].[shdb], p[0-9], z[0-9].[shdb], z[0-9].[shdb]", assembly)`
`222`	`222`	`min = re.findall(`
`223`		`- r"min\tz[0-9].[shdb],( p[0-9]/[zm],)? z[0-9].[shdb], z[0-9].[shdb]", assembly`
	`223`	`+ r"f?min(?:nm)?\tz[0-9].[shdb],( p[0-9]/[zm],)? z[0-9].[shdb], z[0-9].[shdb]", assembly`
`224`	`224`	`)`
`225`	`225`
`226`	`226`	`assert len(loads) > 1`