248 | 248 | #define GGML_F16_VEC_REDUCE GGML_F32Cx4_REDUCE
249 | 249 | #endif
250 | 250 |
| 251 | +#if !(defined(__CUDACC__) && __CUDACC_VER_MAJOR__ <= 11) && !defined(__MUSACC__)
| 252 | +#ifdef GGML_FP16_TO_FP32
| 253 | +#undef GGML_FP16_TO_FP32
| 254 | +#endif
| 255 | +
| 256 | +#ifdef GGML_FP32_TO_FP16
| 257 | +#undef GGML_FP32_TO_FP16
| 258 | +#endif
| 259 | +
| 260 | +#ifdef GGML_COMPUTE_FP16_TO_FP32
| 261 | +#undef GGML_COMPUTE_FP16_TO_FP32
| 262 | +#endif
| 263 | +
| 264 | +#ifdef GGML_COMPUTE_FP32_TO_FP16
| 265 | +#undef GGML_COMPUTE_FP32_TO_FP16
| 266 | +#endif
| 267 | +
| 268 | +#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
| 269 | +#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
| 270 | +
| 271 | +#define GGML_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
| 272 | +
| 273 | +static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
| 274 | +    __fp16 tmp;
| 275 | +    memcpy(&tmp, &h, sizeof(ggml_fp16_t));
| 276 | +    return (float)tmp;
| 277 | +}
| 278 | +
| 279 | +static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
| 280 | +    ggml_fp16_t res;
| 281 | +    __fp16 tmp = f;
| 282 | +    memcpy(&res, &tmp, sizeof(ggml_fp16_t));
| 283 | +    return res;
| 284 | +}
| 285 | +
| 286 | +#endif // !(defined(__CUDACC__) && __CUDACC_VER_MAJOR__ <= 11) && !defined(__MUSACC__)
| 287 | +
251 | 288 | #elif defined(__AVX512F__)
252 | 289 |
253 | 290 | #define GGML_SIMD
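For reference, the converters added in the hunk above rely on the compiler's native `__fp16` type plus a `memcpy` to reinterpret the 16-bit pattern. A minimal, self-contained sketch of the same FP16 -> FP32 semantics in portable C, using only bit manipulation (illustrative only, not code from this patch):

```c
// Portable reference for IEEE 754 binary16 -> binary32 conversion.
// Handles zero, subnormals, normals, infinities and NaN.
#include <stdint.h>
#include <string.h>

static float fp16_bits_to_fp32(uint16_t h) {
    const uint32_t sign = (uint32_t)(h & 0x8000) << 16;
    const uint32_t exp  = (h >> 10) & 0x1F;
    const uint32_t mant = h & 0x3FF;
    uint32_t bits;

    if (exp == 0x1F) {                       // Inf / NaN
        bits = sign | 0x7F800000u | (mant << 13);
    } else if (exp != 0) {                   // normal: rebias exponent 15 -> 127
        bits = sign | ((exp + (127 - 15)) << 23) | (mant << 13);
    } else if (mant == 0) {                  // +/- zero
        bits = sign;
    } else {                                 // subnormal half: renormalize
        int      e = -1;
        uint32_t m = mant;
        do { e++; m <<= 1; } while ((m & 0x400) == 0);
        bits = sign | ((uint32_t)(127 - 15 - e) << 23) | ((m & 0x3FF) << 13);
    }

    float f;
    memcpy(&f, &bits, sizeof(f));            // type-pun via memcpy, as in the patch
    return f;
}
```

The final `memcpy` mirrors the patch's choice of type-punning through `memcpy` rather than a union or pointer cast.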
@@ -410,6 +447,23 @@ do { \
410 | 447 | // the _mm256_cvt intrinsics require F16C
411 | 448 | #define GGML_F32Cx8_LOAD(x) _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)(x)))
412 | 449 | #define GGML_F32Cx8_STORE(x, y) _mm_storeu_si128((__m128i *)(x), _mm256_cvtps_ph(y, 0))
| 450 | +
| 451 | +#ifdef GGML_COMPUTE_FP16_TO_FP32
| 452 | +#undef GGML_COMPUTE_FP16_TO_FP32
| 453 | +#endif
| 454 | +
| 455 | +#ifdef GGML_COMPUTE_FP32_TO_FP16
| 456 | +#undef GGML_COMPUTE_FP32_TO_FP16
| 457 | +#endif
| 458 | +
| 459 | +#ifdef _MSC_VER
| 460 | +    #define GGML_COMPUTE_FP16_TO_FP32(x) _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(x)))
| 461 | +    #define GGML_COMPUTE_FP32_TO_FP16(x) _mm_extract_epi16(_mm_cvtps_ph(_mm_set_ss(x), 0), 0)
| 462 | +#else
| 463 | +    #define GGML_COMPUTE_FP16_TO_FP32(x) _cvtsh_ss(x)
| 464 | +    #define GGML_COMPUTE_FP32_TO_FP16(x) _cvtss_sh(x, 0)
| 465 | +#endif
| 466 | +
413 | 467 | #else
414 | 468 | static inline __m256 __avx_f32cx8_load(const ggml_fp16_t * x) {
415 | 469 |     float tmp[8];
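A small usage sketch for the F16C path above (illustrative, not part of the patch): it round-trips a single float through the same intrinsics the new macros expand to. Assumes an x86 CPU with F16C; with GCC/Clang, build with `-mf16c`.

```c
#include <immintrin.h>
#include <stdio.h>

int main(void) {
    float f = 3.14159f;

#ifdef _MSC_VER
    // MSVC lacks the scalar _cvtss_sh/_cvtsh_ss intrinsics, so the
    // conversion goes through the 128-bit forms, as in the macros above.
    unsigned short h  = (unsigned short) _mm_extract_epi16(_mm_cvtps_ph(_mm_set_ss(f), 0), 0);
    float        back = _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(h)));
#else
    unsigned short h  = _cvtss_sh(f, 0);     // 0 = round to nearest even
    float        back = _cvtsh_ss(h);
#endif

    printf("%f -> 0x%04x -> %f\n", f, (unsigned) h, back);
    return 0;
}
```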
@@ -519,6 +573,53 @@ static inline unsigned char ggml_endian_byte(int i) {
519 | 573 | r[i - GGML_ENDIAN_BYTE(0)]), \
520 | 574 | 0, p - GGML_F16_EPR)
521 | 575 |
| 576 | +#ifdef GGML_FP16_TO_FP32
| 577 | +#undef GGML_FP16_TO_FP32
| 578 | +#endif
| 579 | +
| 580 | +#ifdef GGML_FP32_TO_FP16
| 581 | +#undef GGML_FP32_TO_FP16
| 582 | +#endif
| 583 | +
| 584 | +#ifdef GGML_COMPUTE_FP16_TO_FP32
| 585 | +#undef GGML_COMPUTE_FP16_TO_FP32
| 586 | +#endif
| 587 | +
| 588 | +#ifdef GGML_COMPUTE_FP32_TO_FP16
| 589 | +#undef GGML_COMPUTE_FP32_TO_FP16
| 590 | +#endif
| 591 | +
| 592 | +#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
| 593 | +#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
| 594 | +/* the inline asm below is about 12% faster than the lookup method */
| 595 | +#define GGML_FP16_TO_FP32(x) GGML_COMPUTE_FP16_TO_FP32(x)
| 596 | +#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
| 597 | +
| 598 | +static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
| 599 | +    float f;
| 600 | +    double d;
| 601 | +    __asm__(
| 602 | +        "mtfprd %0,%2\n"
| 603 | +        "xscvhpdp %0,%0\n"
| 604 | +        "frsp %1,%0\n" :
| 605 | +        /* temp */ "=d"(d),
| 606 | +        /* out */  "=f"(f):
| 607 | +        /* in */   "r"(h));
| 608 | +    return f;
| 609 | +}
| 610 | +
| 611 | +static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
| 612 | +    double d;
| 613 | +    ggml_fp16_t r;
| 614 | +    __asm__( /* xscvdphp can work on double or single precision */
| 615 | +        "xscvdphp %0,%2\n"
| 616 | +        "mffprd %1,%0\n" :
| 617 | +        /* temp */ "=d"(d),
| 618 | +        /* out */  "=r"(r):
| 619 | +        /* in */   "f"(f));
| 620 | +    return r;
| 621 | +}
| 622 | +
522 | 623 | #elif defined(__wasm_simd128__)
523 | 624 |
524 | 625 | #define GGML_SIMD
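The "lookup method" mentioned in the comment above refers to precomputing a float for each of the 65536 half bit patterns and converting with a single table load. A rough sketch of that approach, hedged: the names below (`f16_table`, `init_f16_table`, `f16_to_f32_lut`) are placeholders for illustration, not symbols from this file.

```c
#include <stdint.h>

// 64K floats = 256 KiB, filled once at startup.
static float f16_table[1 << 16];

// `convert` is any correct scalar FP16 -> FP32 routine, e.g. one of the
// platform-specific implementations in this header.
static void init_f16_table(float (*convert)(uint16_t)) {
    for (uint32_t i = 0; i < (1u << 16); i++) {
        f16_table[i] = convert((uint16_t) i);
    }
}

static inline float f16_to_f32_lut(uint16_t h) {
    return f16_table[h];   // one memory load instead of a convert instruction
}
```

On POWER9 the inline asm above trades the table's 256 KiB cache footprint and startup cost for a direct `xscvhpdp` convert.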
@@ -1052,6 +1153,35 @@ static inline ggml_fp16_t nnpa_compute_fp32_to_fp16(float f) {
1052 | 1153 |
1053 | 1154 | #endif // __NNPA__
1054 | 1155 |
| 1156 | +#elif defined(__riscv) && defined(__riscv_zfhmin)
| 1157 | +
| 1158 | +static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
| 1159 | +    float f;
| 1160 | +    __asm__(
| 1161 | +        "fmv.h.x %[f], %[h]\n\t"
| 1162 | +        "fcvt.s.h %[f], %[f]"
| 1163 | +        : [f] "=&f" (f)
| 1164 | +        : [h] "r" (h)
| 1165 | +    );
| 1166 | +    return f;
| 1167 | +}
| 1168 | +
| 1169 | +static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
| 1170 | +    ggml_fp16_t res;
| 1171 | +    __asm__(
| 1172 | +        "fcvt.h.s %[f], %[f]\n\t"
| 1173 | +        "fmv.x.h %[h], %[f]"
| 1174 | +        : [h] "=&r" (res)
| 1175 | +        : [f] "f" (f)
| 1176 | +    );
| 1177 | +    return res;
| 1178 | +}
| 1179 | +
| 1180 | +#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
| 1181 | +#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
| 1182 | +#define GGML_FP16_TO_FP32(x) GGML_COMPUTE_FP16_TO_FP32(x)
| 1183 | +#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
| 1184 | +
1055 | 1185 | #endif
1056 | 1186 |
1057 | 1187 | // GGML_F32_ARR / GGML_F16_ARR
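One way to sanity-check whichever conversion pair a platform selects is an exhaustive round trip over all 65536 FP16 bit patterns: every finite half value is exactly representable as a float, so converting h -> f -> h must give back the original bits (NaNs are skipped because their payloads may legitimately change). A hedged test sketch, assuming it is compiled in a translation unit where the macros above are visible and `ggml_fp16_t` is a 16-bit integer type:

```c
#include <stdint.h>
#include <stdio.h>

// A half-precision NaN has all exponent bits set and a non-zero mantissa.
static int is_f16_nan(uint16_t h) {
    return (h & 0x7C00) == 0x7C00 && (h & 0x03FF) != 0;
}

// Returns the number of bit patterns that fail to round-trip under the
// default floating-point environment.
int check_fp16_roundtrip(void) {
    int failures = 0;
    for (uint32_t i = 0; i < (1u << 16); i++) {
        uint16_t h = (uint16_t) i;
        if (is_f16_nan(h)) {
            continue;
        }
        float    f = GGML_COMPUTE_FP16_TO_FP32(h);
        uint16_t r = (uint16_t) GGML_COMPUTE_FP32_TO_FP16(f);
        if (r != h) {
            printf("mismatch: 0x%04x -> %a -> 0x%04x\n", (unsigned) h, f, (unsigned) r);
            failures++;
        }
    }
    return failures;
}
```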