@@ -76,14 +76,18 @@ void compute_ref(
     size_t m, size_t k, size_t n,
     D_Type* ref_d_data,
     float* ref_d_amax,
-    Gelu_Type* ref_gelu_data){
+    Gelu_Type* ref_gelu_data,
+    bool transa,
+    bool transb){

   *ref_d_amax = 0;
   for (size_t ii = 0; ii < m; ii++){
     for (size_t jj = 0; jj < n; jj++){
       float val = 0;
       for (size_t kk = 0; kk < k; kk++){
-        val += a_scale_inv*b_scale_inv*((float)a_data[ii + kk*m])*((float)b_data[kk + jj*k]);
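+        // Operands are stored column-major; a transposed operand has its
+        // dimensions swapped, so op(A)(ii,kk) reads a_data[kk + ii*k] and
+        // op(B)(kk,jj) reads b_data[jj + kk*n].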
+        float a_val = transa ? (float)a_data[kk + ii*k] : (float)a_data[ii + kk*m];
+        float b_val = transb ? (float)b_data[jj + kk*n] : (float)b_data[kk + jj*k];
+        val += a_scale_inv*b_scale_inv*a_val*b_val;
       }
       if (bias_data){
         val += (float)bias_data[ii];
@@ -103,16 +107,24 @@ void compute_ref(
 }

 template <typename A_Type, typename B_Type, typename Bias_Type, typename Gelu_Type, typename D_Type>
-void performTest(bool use_bias, bool use_gelu, const size_t m, const size_t k, const size_t n) {
+void performTest(bool use_bias, bool use_gelu, const size_t m, const size_t k, const size_t n, char transa_char = 'N', char transb_char = 'N') {
   DType atype = TypeInfo<A_Type>::dtype;
   DType btype = TypeInfo<B_Type>::dtype;
   DType bias_type = TypeInfo<Bias_Type>::dtype;
   DType gelu_type = TypeInfo<Gelu_Type>::dtype;
   DType dtype = TypeInfo<D_Type>::dtype;
-
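+  // Anything other than 'T'/'t' is treated as 'N' (no transpose).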
+  bool transa = (transa_char == 'T' || transa_char == 't');
+  bool transb = (transb_char == 'T' || transb_char == 't');
+
   // pytorch tensor storage is row-major while cublas/rocblas is column-major
   Tensor A({ k, m }, atype);
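+  // A transposed operand is allocated with its shape swapped, so the
+  // column-major view seen by cublas/rocblas is the transposed matrix.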
+  if (transa){
+    A = Tensor({ m, k }, atype);
+  }
   Tensor B({ n, k }, btype);
+  if (transb){
+    B = Tensor({ k, n }, btype);
+  }
   Tensor D({ n, m }, dtype);
   Tensor bias;
   if (use_bias){
@@ -133,8 +145,7 @@ void performTest(bool use_bias, bool use_gelu, const size_t m, const size_t k, c
   if (isFp8Type(dtype)){
     setRandomScale(&D);
   }
-  bool transa = false;
-  bool transb = false;
+
   bool grad = false;
   bool accumulate = false;
@@ -189,7 +200,9 @@ void performTest(bool use_bias, bool use_gelu, const size_t m, const size_t k, c
       m, k, n,
       ref_D.get(),
       &ref_amax_d,
-      use_gelu? ref_pre_gelu_out.get(): nullptr);
+      use_gelu? ref_pre_gelu_out.get(): nullptr,
+      transa,
+      transb);
   // check if error message happens in running
   cudaDeviceSynchronize();
   auto err = cudaGetLastError();
@@ -221,7 +234,28 @@ void performTest(bool use_bias, bool use_gelu, const size_t m, const size_t k, c
 using fp32=float;
 using fp8=fp8e4m3;
 using bf8=fp8e5m2;
-
+
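+// Exercises the new transpose path: A is passed as 'T', B as 'N'.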
+TEST_P(GEMMTestSuite, Testfp8xfp8xfp16xfp16xfp8) {
+  using namespace transformer_engine;
+  using namespace test;
+
+  const size_t m = std::get<0>(std::get<0>(GetParam()));
+  const size_t k = std::get<1>(std::get<0>(GetParam()));
+  const size_t n = std::get<2>(std::get<0>(GetParam()));
+  const bool use_bias = std::get<1>(GetParam());
+  const bool use_gelu = std::get<2>(GetParam());
+  char transa_char = 'T';
+  char transb_char = 'N';
+
+  using A_Type = fp8;
+  using B_Type = fp8;
+  using Bias_Type = fp16;
+  using Gelu_Type = fp16;
+  using D_Type = fp8;
+
+  performTest<A_Type, B_Type, Bias_Type, Gelu_Type, D_Type>(use_bias, use_gelu, m, k, n, transa_char, transb_char);
+}
+
 TEST_P(GEMMTestSuite, Testfp32xfp32xfp32xfp32xfp32) {
   using namespace transformer_engine;
   using namespace test;