|
| 1 | +#include <ATen/ATen.h> |
| 2 | + |
| 3 | +#include <ATen/NativeFunctions.h> |
| 4 | +#include <ATen/Parallel.h> |
| 5 | +#include <ATen/native/ReduceOpsUtils.h> |
| 6 | +#include <ATen/native/cpu/utils.h> |
| 7 | +#include <ATen/record_function.h> |
| 8 | +#include <c10/util/irange.h> |
| 9 | + |
| 10 | +#include "SelectiveScan.h" |
| 11 | +#include "utils/library.h" |
| 12 | + |
| 13 | +namespace torch_ipex { |
| 14 | +namespace cpu { |
| 15 | + |
// Definitions of the two dispatch stubs that selective_scan() and
// selective_state_update() in this file invoke with kCPU as first argument.
// NOTE(review): the stub types are presumably declared in "SelectiveScan.h"
// and the actual kernels registered in a separate translation unit -- confirm.
| 16 | +IPEX_DEFINE_DISPATCH(selective_scan_kernel_stub); |
| 17 | +IPEX_DEFINE_DISPATCH(selective_state_update_kernel_stub); |
| 18 | + |
| 19 | +/** |
| 20 | + * Does selective scan algorithm in Mamba Paper. |
| 21 | + * Paper: https://arxiv.org/abs/2312.00752 |
| 22 | + * Official Python Implementation: |
| 23 | + * selective_scan_ref: |
| 24 | + * https://github.com/state-spaces/mamba/blob/main/mamba_ssm/ops/selective_scan_interface.py#L113 |
| 25 | + * @param u: (batch, dim, len) or (batch, len, dim) |
| 26 | + * @param delta: same shape as u |
| 27 | + * @param A: (dim, dstate) or (dstate, dim) |
| 28 | + * @param B: (batch, dstate, len) or (batch, dstate, 2len) or (battch, ngroups, |
| 29 | + * dstate, len) |
| 30 | + * @param C: (batch, dstate, len) or (batch, dstate, 2len) or (battch, ngroups, |
| 31 | + * dstate, len) |
| 32 | + * @param D: (dim,) or None |
| 33 | + * @param z: (batch, dim, len) or None |
| 34 | + * @param delta_bias: (dim,) or None |
| 35 | + * @param delta_softplus: bool |
| 36 | + * @param return_last_state: bool |
| 37 | + * @return: out: (batch, dim, len), last_state: (batch, dim, dstate) |
| 38 | + */ |
| 39 | +std::tuple<at::Tensor, at::Tensor> selective_scan( |
| 40 | + const at::Tensor& u, |
| 41 | + const at::Tensor& delta, |
| 42 | + const at::Tensor& A, |
| 43 | + const at::Tensor& B, |
| 44 | + const at::Tensor& C, |
| 45 | + const c10::optional<at::Tensor>& D, |
| 46 | + const c10::optional<at::Tensor>& z, |
| 47 | + const c10::optional<at::Tensor>& delta_bias, |
| 48 | + bool delta_softplus, |
| 49 | + bool return_last_state) { |
| 50 | + RECORD_FUNCTION("selective_scan_fn", c10::ArrayRef<c10::IValue>({})); |
| 51 | + return selective_scan_kernel_stub( |
| 52 | + kCPU, |
| 53 | + u, |
| 54 | + delta, |
| 55 | + A, |
| 56 | + B, |
| 57 | + C, |
| 58 | + D, |
| 59 | + z, |
| 60 | + delta_bias, |
| 61 | + delta_softplus, |
| 62 | + return_last_state); |
| 63 | +} |
| 64 | + |
| 65 | +/** |
| 66 | + * Official Python Implementation: |
| 67 | + * selective_state_update_ref: |
| 68 | + * https://github.com/state-spaces/mamba/blob/main/mamba_ssm/ops/triton/selective_state_update.py#L219 |
| 69 | + * @param state: (batch, dim, dstate) or (batch, nheads, dim, dstate) |
| 70 | + * @param x: (batch, dim) or (batch, nheads, dim) |
| 71 | + * @param dt: (batch, dim) or (batch, nheads, dim) |
| 72 | + * @param A: (dim, dstate) or (nheads, dim, dstate) or (dstate, dim) or (nheads, |
| 73 | + * dstate, dim) |
| 74 | + * @param B: (batch, dstate) or (batch, ngroups, dstate) |
| 75 | + * @param C: (batch, dstate) or (batch, ngroups, dstate) |
| 76 | + * @param D: (dim,) or (nheads, dim) or None |
| 77 | + * @param z: (batch, dim) or (batch, nheads, dim) or None |
| 78 | + * @param dt_bias: (dim,) or (nheads, dim) or None |
| 79 | + * @param dt_softplus: bool |
| 80 | + * @return: out: (batch, dim) or (batch, nheads, dim) |
| 81 | + */ |
| 82 | +at::Tensor selective_state_update( |
| 83 | + const at::Tensor& state, |
| 84 | + const at::Tensor& x, |
| 85 | + const at::Tensor& dt, |
| 86 | + const at::Tensor& A, |
| 87 | + const at::Tensor& B, |
| 88 | + const at::Tensor& C, |
| 89 | + const c10::optional<at::Tensor>& D, |
| 90 | + const c10::optional<at::Tensor>& z, |
| 91 | + const c10::optional<at::Tensor>& dt_bias, |
| 92 | + bool dt_softplus) { |
| 93 | + RECORD_FUNCTION("selective_state_update", c10::ArrayRef<c10::IValue>({})); |
| 94 | + return selective_state_update_kernel_stub( |
| 95 | + kCPU, state, x, dt, A, B, C, D, z, dt_bias, dt_softplus); |
| 96 | +} |
| 97 | + |
| 98 | +} // namespace cpu |
| 99 | +} // namespace torch_ipex |
| 100 | + |
| 101 | +namespace { |
| 102 | + |
| 103 | +IPEX_TORCH_LIBRARY_FRAGMENT(torch_ipex, m) { |
| 104 | + m.def( |
| 105 | + "selective_scan_fn(Tensor u, Tensor delta, Tensor A, Tensor B, Tensor C, Tensor? D, Tensor? z, Tensor? delta_bias, bool delta_softplus, bool return_last_state) -> (Tensor, Tensor)"); |
| 106 | + m.impl( |
| 107 | + "selective_scan_fn", |
| 108 | + c10::DispatchKey::CPU, |
| 109 | + torch_ipex::cpu::selective_scan); |
| 110 | + m.def( |
| 111 | + "selective_state_update(Tensor state, Tensor x, Tensor dt, Tensor A, Tensor B, Tensor C, Tensor? D, Tensor? z, Tensor? dt_bias, bool dt_softplus) -> (Tensor)"); |
| 112 | + m.impl( |
| 113 | + "selective_state_update", |
| 114 | + c10::DispatchKey::CPU, |
| 115 | + torch_ipex::cpu::selective_state_update); |
| 116 | +} |
| 117 | + |
| 118 | +} // namespace |
0 commit comments