
Commit 34fab8e

[release/2.4] Port WOQ optimizations for llama3.1 (#3156)
* WOQ: Cache INT8 weight for weight=int4 lowp-mode=INT8 for large batch (#3012)
  * WOQ: Cache INT8 weight for weight=int4 lowp-mode=INT8 for large batch
  * prefetch weight in microkernel
  * Fix UT
  * refine UT
  * Fix format issues
* WOQ: Linear M pad to even; refine block_m (#3051)
  * WOQ: Linear M pad to even; refine block_m
  * Add comment and refine code
* add tpp ACB loop (#3107) (#3114)

---------

Co-authored-by: jianan-gu <[email protected]>
1 parent: c8abce7 · commit: 34fab8e
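
The headline optimization: when an INT4 weight runs with lowp_mode=INT8, re-dequantizing the weight to INT8 on every GEMM call is wasteful once the batch is large, so the INT8 form is computed once and reused (the kernel hook for this, woq_dequant_int4_to_int8_packed_stub, is declared in Linear.h below). A minimal sketch of the caching idea, with a mock dequantization routine and hypothetical names rather than the real blocked-layout kernel:

#include <ATen/ATen.h>
#include <optional>

// Stand-in for the real kernel: unpack two 4-bit values per byte and
// widen to int8. Illustrative only; not the IPEX implementation.
at::Tensor mock_dequant_int4_to_int8(const at::Tensor& packed) {
  auto lo = packed.bitwise_and(0x0F);
  auto hi = packed.bitwise_right_shift(4).bitwise_and(0x0F);
  return at::stack({lo, hi}, /*dim=*/-1).flatten(-2).to(at::kChar);
}

struct CachedWoqWeight {
  at::Tensor int4_packed;                // always-resident packed INT4 data
  std::optional<at::Tensor> int8_cache;  // filled on first use, then reused

  const at::Tensor& int8_weight() {
    if (!int8_cache.has_value()) {
      int8_cache = mock_dequant_int4_to_int8(int4_packed);
    }
    return *int8_cache;
  }
};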

File tree

12 files changed: +784, -358 lines


csrc/cpu/aten/Linear.cpp

Lines changed: 72 additions & 14 deletions
@@ -370,8 +370,11 @@ at::Tensor woq_linear_pack_weight(
     // For TPP kernel, we only consider even K
     if (K % 2 == 0) {
       size_t block_n = 32;
-      size_t default_block_k = (weight_dtype == WOQ_DTYPE_INT4 && lowp_mode == 3) ? 128 : 64;
-      size_t block_k = group_size > 0 ? std::min((size_t)group_size, default_block_k) : default_block_k;
+      size_t default_block_k =
+          (weight_dtype == WOQ_DTYPE_INT4 && lowp_mode == 3) ? 128 : 64;
+      size_t block_k = group_size > 0
+          ? std::min((size_t)group_size, default_block_k)
+          : default_block_k;
       while (K % block_k != 0) {
         block_k /= 2;
       }
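
Restated outside the diff, the new block_k selection starts from a dtype-dependent default (128 for INT4 weight with lowp_mode=INT8, i.e. lowp_mode == 3, otherwise 64), clamps to the quantization group size, then halves until it divides K. A standalone sketch of the same logic:

#include <algorithm>
#include <cstddef>
#include <cstdint>

size_t choose_block_k(size_t K, int64_t group_size, bool int4_int8_compute) {
  // Default block along K: wider for the INT4-weight / INT8-compute path.
  size_t block_k = int4_int8_compute ? 128 : 64;
  if (group_size > 0) {
    // Keep each K block inside a single quantization group.
    block_k = std::min(block_k, (size_t)group_size);
  }
  // Shrink until block_k evenly divides K; K is already even on this path.
  while (K % block_k != 0) {
    block_k /= 2;
  }
  return block_k;
}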
@@ -418,6 +421,7 @@ at::Tensor woq_linear_unpack_weight(
   return woq_tpp_gemm_unpackB_stub(kCPU, weight, weight_dtype, lowp_mode);
 }
 
+#define PAD_M_THRESHOLD 32
 IPEX_DEFINE_DISPATCH(woq_tpp_gemm_kernel_stub);
 at::Tensor woq_linear_kernel(
     const at::Tensor& self,
@@ -428,11 +432,22 @@ at::Tensor woq_linear_kernel(
     const std::vector<at::Tensor>& bias_list,
     int64_t group_size,
     int64_t lowp_mode,
-    int64_t act_quant_mode) {
+    int64_t act_quant_mode,
+    const c10::optional<at::Tensor>& compensation) {
   int64_t quant_w_mode = group_size > 0 ? 1 : 0;
-  return woq_tpp_gemm_kernel_stub(
+  auto K = self.size(-1);
+  auto M = self.numel() / K;
+  auto in = self;
+  // Big performance regression is found with some odd M
+  // So, we pad M to the nearest even number
+  bool m_padded = false;
+  if (M >= PAD_M_THRESHOLD && M % 2 == 1) {
+    in = at::pad(in.view({M, K}), {0, 0, 0, 1}, "constant", 0);
+    m_padded = true;
+  }
+  auto y = woq_tpp_gemm_kernel_stub(
       kCPU,
-      self,
+      in,
       weight,
       scales_list,
       zps_list,
@@ -443,7 +458,14 @@ at::Tensor woq_linear_kernel(
       std::vector<at::Tensor>(),
       act_quant_mode,
       quant_w_mode,
-      group_size);
+      group_size,
+      compensation);
+  if (m_padded) {
+    auto out_size = self.sizes().vec();
+    out_size.back() = y.size(-1);
+    y = y.narrow(0, 0, M).view(out_size);
+  }
+  return y;
 }
 
 at::Tensor woq_linear_forward(
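
The same pad-then-slice pattern is repeated in the unary and binary kernels below. A self-contained round trip of that logic, using at::matmul as a stand-in for woq_tpp_gemm_kernel_stub and the PAD_M_THRESHOLD of 32 from this commit:

#include <ATen/ATen.h>

at::Tensor padded_gemm(const at::Tensor& self, const at::Tensor& weight) {
  auto K = self.size(-1);
  auto M = self.numel() / K;
  auto in = self.view({M, K});
  bool m_padded = false;
  if (M >= 32 && M % 2 == 1) {
    // {pad_left, pad_right, pad_top, pad_bottom}: one zero row at the bottom.
    in = at::pad(in, {0, 0, 0, 1}, "constant", 0);
    m_padded = true;
  }
  auto y = at::matmul(in, weight); // stand-in for the WOQ TPP kernel
  if (m_padded) {
    y = y.narrow(0, 0, M); // drop the zero row added above
  }
  auto out_size = self.sizes().vec();
  out_size.back() = y.size(-1);
  return y.view(out_size); // restore the original leading dimensions
}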
@@ -466,7 +488,8 @@ at::Tensor woq_linear_unary_kernel(
     const c10::optional<c10::string_view>& algorithm,
     int64_t group_size,
     int64_t lowp_mode,
-    int64_t act_quant_mode) {
+    int64_t act_quant_mode,
+    const c10::optional<at::Tensor>& compensation) {
   int64_t post_op_fusion_type = WOQ_FUSE_NONE;
   if (post_op == "gelu") {
     if (algorithm == "none") {
@@ -480,9 +503,19 @@ at::Tensor woq_linear_unary_kernel(
     post_op_fusion_type = WOQ_FUSE_SILU;
   }
   int64_t quant_w_mode = group_size > 0 ? 1 : 0;
-  return woq_tpp_gemm_kernel_stub(
+  auto K = self.size(-1);
+  auto M = self.numel() / K;
+  auto in = self;
+  // Big performance regression is found with some odd M
+  // So, we pad M to the nearest even number
+  bool m_padded = false;
+  if (M >= PAD_M_THRESHOLD && M % 2 == 1) {
+    in = at::pad(in.view({M, K}), {0, 0, 0, 1}, "constant", 0);
+    m_padded = true;
+  }
+  auto y = woq_tpp_gemm_kernel_stub(
       kCPU,
-      self,
+      in,
       weight,
       scales_list,
       zps_list,
@@ -493,7 +526,14 @@ at::Tensor woq_linear_unary_kernel(
       std::vector<at::Tensor>(),
       act_quant_mode,
       quant_w_mode,
-      group_size);
+      group_size,
+      compensation);
+  if (m_padded) {
+    auto out_size = self.sizes().vec();
+    out_size.back() = y.size(-1);
+    y = y.narrow(0, 0, M).view(out_size);
+  }
+  return y;
 }
 
 at::Tensor woq_linear_gelu_forward(
@@ -541,7 +581,8 @@ at::Tensor woq_linear_binary_kernel(
     int64_t lowp_mode,
     const c10::string_view& post_op,
     const std::vector<at::Tensor>& others,
-    int64_t act_quant_mode) {
+    int64_t act_quant_mode,
+    const c10::optional<at::Tensor>& compensation) {
   int64_t post_op_fusion_type = WOQ_FUSE_NONE;
   if (post_op == "add") {
     post_op_fusion_type = WOQ_FUSE_ADD;
@@ -551,9 +592,19 @@ at::Tensor woq_linear_binary_kernel(
     post_op_fusion_type = WOQ_FUSE_MUL;
   }
   int64_t quant_w_mode = group_size > 0 ? 1 : 0;
-  return woq_tpp_gemm_kernel_stub(
+  auto K = self.size(-1);
+  auto M = self.numel() / K;
+  auto in = self;
+  // Big performance regression is found with some odd M
+  // So, we pad M to the nearest even number
+  bool m_padded = false;
+  if (M >= PAD_M_THRESHOLD && M % 2 == 1) {
+    in = at::pad(in.view({M, K}), {0, 0, 0, 1}, "constant", 0);
+    m_padded = true;
+  }
+  auto y = woq_tpp_gemm_kernel_stub(
       kCPU,
-      self,
+      in,
       weight,
       scales_list,
       zps_list,
@@ -564,7 +615,14 @@ at::Tensor woq_linear_binary_kernel(
       others,
       act_quant_mode,
       quant_w_mode,
-      group_size);
+      group_size,
+      compensation);
+  if (m_padded) {
+    auto out_size = self.sizes().vec();
+    out_size.back() = y.size(-1);
+    y = y.narrow(0, 0, M).view(out_size);
+  }
+  return y;
 }
 
 at::Tensor woq_linear_add_forward(

csrc/cpu/aten/Linear.h

Lines changed: 19 additions & 4 deletions
@@ -136,7 +136,8 @@ at::Tensor woq_linear_kernel(
     const std::vector<at::Tensor>& bias_list,
     int64_t group_size,
     int64_t lowp_mode,
-    int64_t act_quant_mode);
+    int64_t act_quant_mode,
+    const c10::optional<at::Tensor>& compensation = c10::nullopt);
 
 at::Tensor woq_linear_unary_kernel(
     const at::Tensor& self,
@@ -150,7 +151,8 @@ at::Tensor woq_linear_unary_kernel(
     const c10::optional<c10::string_view>& algorithm,
     int64_t group_size,
     int64_t lowp_mode,
-    int64_t act_quant_mode);
+    int64_t act_quant_mode,
+    const c10::optional<at::Tensor>& compensation = c10::nullopt);
 
 at::Tensor woq_linear_binary_kernel(
     const at::Tensor& self,
@@ -163,7 +165,8 @@ at::Tensor woq_linear_binary_kernel(
     int64_t lowp_mode,
     const c10::string_view& post_op,
     const std::vector<at::Tensor>& others,
-    int64_t act_quant_mode);
+    int64_t act_quant_mode,
+    const c10::optional<at::Tensor>& compensation = c10::nullopt);
 
 namespace {
 void woq_gemm_kernel_impl(
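
Because compensation defaults to c10::nullopt in all three declarations, existing call sites compile unchanged. A usage sketch (assuming tensors of the appropriate shapes are already in scope):

// Old-style call: compensation silently defaults to c10::nullopt.
auto y0 = woq_linear_kernel(
    x, qweight, scales_list, zps_list, bias_list,
    group_size, lowp_mode, act_quant_mode);
// Opt-in call: pass a precomputed compensation tensor for the INT8 path.
auto y1 = woq_linear_kernel(
    x, qweight, scales_list, zps_list, bias_list,
    group_size, lowp_mode, act_quant_mode, compensation);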
@@ -239,16 +242,28 @@ using woq_tpp_gemm_kernel_fn = at::Tensor (*)(
     const std::vector<at::Tensor>&,
     int64_t,
     int64_t,
-    int64_t);
+    int64_t,
+    const c10::optional<at::Tensor>&);
 
 using woq_tpp_gemm_packB_fn =
     at::Tensor (*)(const at::Tensor&, int, size_t, size_t, int64_t);
 
 using woq_tpp_gemm_unpackB_fn = at::Tensor (*)(const at::Tensor&, int, int64_t);
 
+using woq_dequant_int4_to_int8_packed_fn = at::Tensor (*)(
+    const at::Tensor&,
+    const at::Tensor&,
+    const at::Tensor&,
+    int,
+    int64_t,
+    at::Tensor&);
+
 IPEX_DECLARE_DISPATCH(woq_tpp_gemm_kernel_fn, woq_tpp_gemm_kernel_stub);
 IPEX_DECLARE_DISPATCH(woq_tpp_gemm_packB_fn, woq_tpp_gemm_packB_stub);
 IPEX_DECLARE_DISPATCH(woq_tpp_gemm_unpackB_fn, woq_tpp_gemm_unpackB_stub);
+IPEX_DECLARE_DISPATCH(
+    woq_dequant_int4_to_int8_packed_fn,
+    woq_dequant_int4_to_int8_packed_stub);
 
 // Fusion types
 #define WOQ_FUSE_NONE 0x0
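
IPEX_DECLARE_DISPATCH / IPEX_DEFINE_DISPATCH follow ATen's DispatchStub pattern: a named, typed function-pointer slot that per-ISA kernel files fill in at registration time. Roughly, and only as an illustration of the mechanism:

#include <ATen/ATen.h>

// Simplified model of a dispatch stub (the real macros also select among
// per-ISA kernel builds at runtime based on CPU capability).
using woq_dequant_fn = at::Tensor (*)(
    const at::Tensor&, const at::Tensor&, const at::Tensor&,
    int, int64_t, at::Tensor&);

struct WoqDequantStub {
  woq_dequant_fn fn = nullptr; // assigned by the kernel's registration code
  at::Tensor operator()(
      const at::Tensor& qw, const at::Tensor& scales, const at::Tensor& zps,
      int weight_dtype, int64_t lowp_mode, at::Tensor& out) const {
    return fn(qw, scales, zps, weight_dtype, lowp_mode, out);
  }
};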
