@@ -591,42 +591,17 @@ static __device__ __forceinline__ void ggml_cuda_mad(float & acc, const half2 v,
591
591
#endif // defined(GGML_USE_HIP) && (defined(RDNA2) || defined(RDNA3) || defined(RDNA4) || defined(GCN5) || defined(CDNA))
592
592
}
593
593
594
- static __device__ __forceinline__ void ggml_cuda_mad (half2 & acc, const half2 v, const half2 u) {
595
- #ifdef FAST_FP16_AVAILABLE
596
- acc += v*u;
597
- #else
598
- const float2 tmpv = __half22float2 (v);
599
- const float2 tmpu = __half22float2 (u);
600
- float2 tmpacc = __half22float2 (acc);
601
- tmpacc.x += tmpv.x * tmpu.x ;
602
- tmpacc.y += tmpv.y * tmpu.y ;
603
- acc = make_half2 (tmpacc.x , tmpacc.y );
604
- #endif // FAST_FP16_AVAILABLE
605
- }
606
-
607
594
// Aligned memory transfers of 8/16 bytes can be faster than 2 transfers with 4 bytes, especially on AMD.
608
- template <int nbytes, int alignment = 0 >
595
+ template <int nbytes>
609
596
static __device__ __forceinline__ void ggml_cuda_memcpy_1 (void * __restrict__ dst, const void * __restrict__ src) {
610
- if constexpr (alignment != 0 ) {
611
- static_assert (nbytes % alignment == 0 , " bad alignment" );
612
- }
613
- constexpr int nb_per_cpy = alignment == 0 ? nbytes : alignment;
614
-
615
- #pragma unroll
616
- for (int i = 0 ; i < nbytes/nb_per_cpy; ++i) {
617
- if constexpr (nb_per_cpy == 1 ) {
618
- ((char *) dst)[i] = ((const char *) src)[i];
619
- } else if constexpr (nb_per_cpy == 2 ) {
620
- ((short *) dst)[i] = ((const short *) src)[i];
621
- } else if constexpr (nb_per_cpy == 4 ) {
622
- ((int *) dst)[i] = ((const int *) src)[i];
623
- } else if constexpr (nb_per_cpy == 8 ) {
624
- ((int2 *) dst)[i] = ((const int2 *) src)[i];
625
- } else if constexpr (nb_per_cpy == 16 ) {
626
- ((int4 *) dst)[i] = ((const int4 *) src)[i];
627
- } else {
628
- static_assert (nbytes == 0 && nbytes == -1 , " bad nbytes" );
629
- }
597
+ if constexpr (nbytes == 4 ) {
598
+ *(int *) dst = *(const int *) src;
599
+ } else if constexpr (nbytes == 8 ) {
600
+ *(int2 *) dst = *(const int2 *) src;
601
+ } else if constexpr (nbytes == 16 ) {
602
+ *(int4 *) dst = *(const int4 *) src;
603
+ } else {
604
+ static_assert (nbytes == 0 && nbytes == -1 , " bad nbytes" );
630
605
}
631
606
}
632
607
0 commit comments