Commit e070829

Support bfloat16 data type (#54)
1 parent 436e523 commit e070829

12 files changed, +455 -53 lines changed

cacheflow/master/server.py

Lines changed: 2 additions & 2 deletions
@@ -213,8 +213,8 @@ def add_server_arguments(parser: argparse.ArgumentParser):
     parser.add_argument('--use-np-cache', action='store_true',
                         help='save a numpy copy of model weights for faster loading')
     parser.add_argument('--use-dummy-weights', action='store_true', help='use dummy values for model weights')
-    # NOTE(woosuk): If FlashAttention is used, the float data type is not supported.
-    parser.add_argument('--dtype', type=str, default='half', choices=['half'], help='data type')
+    # NOTE(woosuk): FlashAttention does not support float32.
+    parser.add_argument('--dtype', type=str, default='half', choices=['half', 'bfloat16'], help='data type')
     # Parallel arguments
     parser.add_argument('--use-ray', action='store_true', help='use Ray for distributed serving, will be automatically set when using more than 1 GPU')
     parser.add_argument('--pipeline-parallel-size', '-pp', type=int, default=1, help='number of pipeline stages')

cacheflow/models/utils.py

Lines changed: 1 addition & 0 deletions
@@ -17,6 +17,7 @@
     'float': torch.float,
     'float16': torch.float16,
     'float32': torch.float32,
+    'bfloat16': torch.bfloat16,
 }

csrc/activation_kernels.cu

Lines changed: 3 additions & 1 deletion
@@ -34,7 +34,9 @@ void silu_and_mul(
   dim3 grid(num_tokens);
   dim3 block(std::min(d, 1024));
   const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
-  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
+  AT_DISPATCH_FLOATING_TYPES_AND2(
+    at::ScalarType::Half,
+    at::ScalarType::BFloat16,
     input.scalar_type(),
     "silu_and_mul_kernel",
     [&] {
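
Switching from AT_DISPATCH_FLOATING_TYPES_AND_HALF to AT_DISPATCH_FLOATING_TYPES_AND2 is what adds bfloat16 to the set of types the templated kernel is instantiated for: the macro switches on the tensor's scalar type over float and double plus the two extra types named first, and binds scalar_t inside the lambda. A minimal self-contained sketch of the same pattern, using a hypothetical elementwise scale_kernel rather than the real silu_and_mul_kernel:

#include <torch/extension.h>
#include <ATen/cuda/CUDAContext.h>
#include <ATen/Dispatch.h>

// Hypothetical elementwise kernel; scalar_t is supplied by the dispatch macro.
template<typename scalar_t>
__global__ void scale_kernel(
  scalar_t* __restrict__ out,
  const scalar_t* __restrict__ in,
  const float factor,
  const int n) {
  const int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) {
    out[i] = (scalar_t) ((float) in[i] * factor);
  }
}

void scale(torch::Tensor& out, torch::Tensor& in, float factor) {
  const int n = in.numel();
  dim3 grid((n + 255) / 256);
  dim3 block(256);
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  // Dispatch covers float and double plus the two extra types listed first
  // (Half and BFloat16); inside the lambda, scalar_t is the matching C++ type
  // (at::Half / at::BFloat16 for the half-precision cases).
  AT_DISPATCH_FLOATING_TYPES_AND2(
    at::ScalarType::Half,
    at::ScalarType::BFloat16,
    in.scalar_type(),
    "scale_kernel",
    [&] {
      scale_kernel<scalar_t><<<grid, block, 0, stream>>>(
        out.data_ptr<scalar_t>(),
        in.data_ptr<scalar_t>(),
        factor,
        n);
    });
}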

csrc/attention/attention_dtypes.cuh renamed to csrc/attention/attention_dtypes.h

Lines changed: 4 additions & 0 deletions
@@ -3,3 +3,7 @@
 #include "attention_generic.cuh"
 #include "dtype_float16.cuh"
 #include "dtype_float32.cuh"
+
+#ifdef ENABLE_BF16
+#include "dtype_bfloat16.cuh"
+#endif // ENABLE_BF16
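
ENABLE_BF16 acts as a build-time switch: when the extension is compiled without it, no bfloat16 code is pulled in, presumably so the attention headers still build on toolchains or architectures without __nv_bfloat16 support. A rough illustration of the guard pattern (the to_float helpers below are illustrative, not the actual contents of dtype_bfloat16.cuh):

#ifdef ENABLE_BF16
#include <cuda_bf16.h>

// The bf16 overload only exists when the build defines ENABLE_BF16.
__device__ __forceinline__ float to_float(__nv_bfloat16 x) {
  return __bfloat162float(x);
}
#endif  // ENABLE_BF16

// An always-available overload keeps the rest of the code compiling either way.
__device__ __forceinline__ float to_float(float x) {
  return x;
}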

csrc/attention/attention_kernels.cu

Lines changed: 6 additions & 2 deletions
@@ -1,7 +1,7 @@
 #include <torch/extension.h>
 #include <ATen/cuda/CUDAContext.h>
 
-#include "attention_dtypes.cuh"
+#include "attention_dtypes.h"
 #include "attention_utils.cuh"
 
 #include <algorithm>

@@ -438,9 +438,13 @@ void single_query_cached_kv_attention(
   torch::Tensor& context_lens, // [num_seqs]
   int block_size,
   int max_context_len) {
-  // TODO(woosuk): Support FP32 and BF16.
+  // TODO(woosuk): Support FP32.
   if (query.dtype() == at::ScalarType::Half) {
     CALL_KERNEL_LAUNCHER_BLOCK_SIZE(uint16_t);
+#ifdef ENABLE_BF16
+  } else if (query.dtype() == at::ScalarType::BFloat16) {
+    CALL_KERNEL_LAUNCHER_BLOCK_SIZE(__nv_bfloat16);
+#endif
   } else {
     TORCH_CHECK(false, "Unsupported data type: ", query.dtype());
   }
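
On the host side, the new branch maps each torch scalar type to the raw CUDA element type the attention kernels are templated on: at::Half tensors are launched as uint16_t (same bit pattern) and at::BFloat16 tensors as __nv_bfloat16, with the bf16 branch compiled only under ENABLE_BF16. A hedged sketch of that dispatch shape, with a stand-in launch_attention template in place of the real CALL_KERNEL_LAUNCHER_BLOCK_SIZE macro:

#include <stdint.h>
#include <torch/extension.h>
#ifdef ENABLE_BF16
#include <cuda_bf16.h>
#endif

// Stand-in for the real templated kernel launcher.
template<typename T>
void launch_attention(torch::Tensor& query) {
  // The kernels take raw device pointers, so the tensor storage is
  // reinterpreted as the template type: half bits as uint16_t,
  // bfloat16 bits as __nv_bfloat16.
  T* query_ptr = reinterpret_cast<T*>(query.data_ptr());
  // ... launch the templated kernel with query_ptr ...
  (void) query_ptr;
}

void dispatch_attention(torch::Tensor& query) {
  if (query.dtype() == at::ScalarType::Half) {
    launch_attention<uint16_t>(query);
#ifdef ENABLE_BF16
  } else if (query.dtype() == at::ScalarType::BFloat16) {
    launch_attention<__nv_bfloat16>(query);
#endif
  } else {
    TORCH_CHECK(false, "Unsupported data type: ", query.dtype());
  }
}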

csrc/attention/attention_utils.cuh

Lines changed: 1 addition & 1 deletion
@@ -1,6 +1,6 @@
 #pragma once
 
-#include "attention_dtypes.cuh"
+#include "attention_dtypes.h"
 
 #include <float.h>
 #include <type_traits>
