Commit fcebe62

Aleksandar Samardžić (alexsamardzic) authored and committed

Add support for mixed 4-bit/8-bit data types GEMM

1 parent f93a691 · commit fcebe62

14 files changed: +890 −14 lines

include/cutlass/gemm/warp/default_mma_tensor_op_sm80.h
Lines changed: 70 additions & 1 deletion

@@ -268,7 +268,7 @@ struct DefaultMmaTensorOp<
     "DefaultMmaTensorOp with arch::OpMultiplyAddMixedInputUpcast ElementA and ElementB cannot be of the same data type");

   // Data type used for internal computation - use the wider of the two data types for mma.sync operands
-  using ElementOperand = typename platform::conditional<(sizeof(ElementA) > sizeof(ElementB)),
+  using ElementOperand = typename platform::conditional<(sizeof_bits<ElementA>::value > sizeof_bits<ElementB>::value),
                                                          ElementA, ElementB>::type;

   // Operand datatypes in the internal MMA instruction - use the wider of the two data types
@@ -294,6 +294,75 @@ struct DefaultMmaTensorOp<
       Policy, PartitionsK, AccumulatorsInRowMajor>;
 };

+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Partial Specialization - inputs are mixed types - uses wider datatype internally.
+/// (e.g. S32 <= S4 x S8 + S32, S32 <= S8 x S4 + S32)
+template <
+    /// Shape of one matrix production operation (concept: GemmShape)
+    typename WarpShape_,
+    /// Element type of A matrix
+    typename ElementA,
+    /// Layout of A matrix (concept: MatrixLayout)
+    typename LayoutA,
+    /// Element type of B matrix
+    typename ElementB,
+    /// Layout of B matrix (concept: MatrixLayout)
+    typename LayoutB,
+    /// Element type of C matrix
+    typename ElementC,
+    /// Layout of C matrix (concept: MatrixLayout)
+    typename LayoutC,
+    /// Number of partitions along K dimension
+    int PartitionsK,
+    /// Store the accumulators in row major or column major. Row major is used
+    /// when output layout is interleaved.
+    bool AccumulatorsInRowMajor>
+struct DefaultMmaTensorOp<
+    WarpShape_,
+    GemmShape<16, 8, 32>,                 // InstructionShape
+    ElementA,                             // Element type of A matrix in Global Memory
+    LayoutA,                              // Layout of A matrix in Global Memory
+    ElementB,                             // Element type of B matrix in Global Memory
+    LayoutB,                              // Layout of B matrix in Global Memory
+    ElementC,                             // Element type of C matrix in Global Memory
+    LayoutC,                              // Layout of C matrix in Global Memory
+    arch::OpMultiplyAddMixedInputUpcast,  // Tag to indicate mixed-input datatype, where narrower datatype is upcasted to wider datatype
+    PartitionsK, AccumulatorsInRowMajor> {
+
+
+  // Check if the ElementA and ElementB are of different data types
+  static_assert(!platform::is_same<ElementA, ElementB>::value,
+    "DefaultMmaTensorOp with arch::OpMultiplyAddMixedInputUpcast ElementA and ElementB cannot be of the same data type");
+
+  // Data type used for internal computation - use the wider of the two data types for mma.sync operands
+  using ElementOperand = typename platform::conditional<(sizeof_bits<ElementA>::value > sizeof_bits<ElementB>::value),
+                                                         ElementA, ElementB>::type;
+
+  // Operand datatypes in the internal MMA instruction - use the wider of the two data types
+  using MmaElementA = ElementOperand;
+  using MmaElementB = ElementOperand;
+  using MmaElementC = ElementC;
+
+  // Uses
+  using Policy = cutlass::gemm::warp::MmaTensorOpPolicy<
+      cutlass::arch::Mma<
+          GemmShape<16, 8, 32>,
+          32,
+          MmaElementA, cutlass::layout::RowMajor,
+          MmaElementB, cutlass::layout::ColumnMajor,
+          MmaElementC, cutlass::layout::RowMajor,
+          arch::OpMultiplyAddSaturate
+      >,
+      cutlass::MatrixShape<1, 1> >;
+
+  // Define the warp-level tensor op
+  using Type = cutlass::gemm::warp::MmaMixedInputTensorOp<
+      WarpShape_, ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC,
+      Policy, PartitionsK, AccumulatorsInRowMajor>;
+};
+
 /////////////////////////////////////////////////////////////////////////////////////////////////

 } // namespace warp
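
For reference, a minimal usage sketch of the new specialization (not part of the commit; the header list, the 64x64x64 warp shape, and the static_assert are illustrative assumptions): instantiating DefaultMmaTensorOp with S8 for A and S4 for B picks the wider of the two types, so both mma.sync operands resolve to S8 and the GemmShape<16, 8, 32> S8 tensor-core instruction with saturating accumulation is used underneath.

    #include "cutlass/arch/mma.h"
    #include "cutlass/gemm/warp/default_mma_tensor_op_sm80.h"
    #include "cutlass/layout/matrix.h"
    #include "cutlass/numeric_types.h"

    // Warp-level mixed-input MMA: A is S8 (row-major), B is S4 (column-major),
    // accumulation in S32. The specialization above is selected by the
    // GemmShape<16, 8, 32> instruction shape and the mixed-input upcast tag.
    using MixedInputWarpMma = cutlass::gemm::warp::DefaultMmaTensorOp<
        cutlass::gemm::GemmShape<64, 64, 64>,             // WarpShape (illustrative)
        cutlass::gemm::GemmShape<16, 8, 32>,              // InstructionShape
        int8_t,           cutlass::layout::RowMajor,      // ElementA, LayoutA
        cutlass::int4b_t, cutlass::layout::ColumnMajor,   // ElementB, LayoutB
        int32_t,          cutlass::layout::RowMajor,      // ElementC, LayoutC
        cutlass::arch::OpMultiplyAddMixedInputUpcast,
        1,                                                // PartitionsK
        false>;                                           // AccumulatorsInRowMajor

    // Both operands of the underlying mma.sync resolve to the wider type (S8).
    static_assert(cutlass::platform::is_same<
                      typename MixedInputWarpMma::Policy::Operator::ElementA,
                      int8_t>::value,
                  "internal mma.sync operands are upcast to S8");

MixedInputWarpMma::Type is the resulting MmaMixedInputTensorOp, which widens the S4 fragments to S8 in registers (via the FragmentShuffler and NumericArrayConverter changes elsewhere in this commit) before issuing mma.sync.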

include/cutlass/gemm/warp/mma_mixed_input_tensor_op.h
Lines changed: 10 additions & 4 deletions

@@ -104,6 +104,7 @@ struct FragmentShuffler {
 ////////////////////////////////////////////////////////////////////////////////

 /// Partial specialization for `mma.sync` on 16b (F16/BF16) and `ldmatrix` on 8b (S8/U8)
+/// or for `mma.sync` on 8b (S8/U8) and `ldmatrix` on 4b (S4/U4)
 /// for operand A multiplicand going through upcasting.
 template <
   /// Element type for the operand in registers for the mma.sync
@@ -122,8 +123,10 @@ struct FragmentShuffler <ElementMma_, ElementLoad_,
           NumElementsInWarpFragment,
           NumElementsInMmaFragment,
           Operand::kA,
-          typename platform::enable_if<(sizeof_bits<ElementMma_>::value == 16) &&
-                                       (sizeof_bits<ElementLoad_>::value == 8)>::type> {
+          typename platform::enable_if<((sizeof_bits<ElementMma_>::value == 16) &&
+                                        (sizeof_bits<ElementLoad_>::value == 8)) ||
+                                       ((sizeof_bits<ElementMma_>::value == 8) &&
+                                        (sizeof_bits<ElementLoad_>::value == 4))>::type> {
 public:
   using ElementMma = ElementMma_;
   using ElementLoad = ElementLoad_;
@@ -187,6 +190,7 @@ struct FragmentShuffler <ElementMma_, ElementLoad_,
 ////////////////////////////////////////////////////////////////////////////////

 /// Partial specialization for `mma.sync` on 16b (F16/BF16) and `ldmatrix` on 8b (S8/U8)
+/// or for `mma.sync` on 8b (S8/U8) and `ldmatrix` on 4b (S4/U4)
 /// for operand B multiplicand going through upcasting.
 template <
   /// Element type for the operand in registers for the mma.sync
@@ -205,8 +209,10 @@ struct FragmentShuffler <ElementMma_, ElementLoad_,
           NumElementsInWarpFragment,
           NumElementsInMmaFragment,
           Operand::kB,
-          typename platform::enable_if<(sizeof_bits<ElementMma_>::value == 16) &&
-                                       (sizeof_bits<ElementLoad_>::value == 8)>::type> {
+          typename platform::enable_if<((sizeof_bits<ElementMma_>::value == 16) &&
+                                        (sizeof_bits<ElementLoad_>::value == 8)) ||
+                                       ((sizeof_bits<ElementMma_>::value == 8) &&
+                                        (sizeof_bits<ElementLoad_>::value == 4))>::type> {
 public:
   using ElementMma = ElementMma_;
   using ElementLoad = ElementLoad_;
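
To make the broadened condition concrete, here is a small standalone sketch (the UsesShuffler helper is hypothetical, written only to mirror the enable_if predicate): it shows which (mma.sync element, loaded element) pairs now select this shuffler specialization.

    #include "cutlass/numeric_types.h"

    // Hypothetical helper mirroring the enable_if predicate above: true when the
    // FragmentShuffler specialization applies to a (register type, loaded type) pair.
    template <typename ElementMma, typename ElementLoad>
    struct UsesShuffler {
      static bool const value =
          ((cutlass::sizeof_bits<ElementMma>::value == 16) &&
           (cutlass::sizeof_bits<ElementLoad>::value == 8)) ||   // F16/BF16 <- S8/U8 (existing)
          ((cutlass::sizeof_bits<ElementMma>::value == 8) &&
           (cutlass::sizeof_bits<ElementLoad>::value == 4));     // S8/U8 <- S4/U4 (new)
    };

    static_assert(UsesShuffler<cutlass::half_t, int8_t>::value, "16b <- 8b path");
    static_assert(UsesShuffler<int8_t, cutlass::int4b_t>::value, "8b <- 4b path");
    static_assert(!UsesShuffler<cutlass::half_t, cutlass::int4b_t>::value,
                  "16b <- 4b is not handled by this specialization");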

include/cutlass/numeric_conversion.h
Lines changed: 80 additions & 0 deletions

@@ -2771,6 +2771,86 @@ struct NumericArrayConverter<uint4b_t, int, N, Round> {
   }
 };

+/// Partial specialization for Array<int8_t, 8> <= Array<int4b_t, 8>
+template <
+  FloatRoundStyle Round
+>
+struct NumericArrayConverter<int8_t, int4b_t, 8, Round> {
+
+  using result_type = Array<int8_t, 8>;
+  using source_type = Array<int4b_t, 8>;
+  static FloatRoundStyle const round_style = Round;
+
+  CUTLASS_HOST_DEVICE
+  static result_type convert(source_type const & source) {
+
+    unsigned const& storage = reinterpret_cast<unsigned const &>(source);
+    unsigned out[2];
+
+    asm volatile(
+        "{ .reg .u32 tmp0, tmp1, tmp2;"
+        "shl.b32 tmp0, %2, 4;"
+        "and.b32 tmp0, tmp0, 0xf0f0f0f0;"
+        "prmt.b32 tmp1, tmp0, tmp0, 0xba98;"
+        "and.b32 tmp1, tmp1, 0xf0f0f0f0;"
+        "shr.u32 tmp0, tmp0, 4;"
+        "or.b32 tmp2, tmp0, tmp1;"
+        "and.b32 tmp0, %2, 0xf0f0f0f0;"
+        "prmt.b32 tmp1, tmp0, tmp0, 0xba98;"
+        "and.b32 tmp1, tmp1, 0xf0f0f0f0;"
+        "shr.u32 tmp0, tmp0, 4;"
+        "or.b32 tmp0, tmp0, tmp1;"
+        "prmt.b32 %0, tmp2, tmp0, 0x5140;"
+        "prmt.b32 %1, tmp2, tmp0, 0x7362;"
+        "}"
+        : "=r"(out[0]), "=r"(out[1])
+        : "r"(storage));
+
+    return reinterpret_cast<result_type const &>(out);
+  }
+
+  CUTLASS_HOST_DEVICE
+  result_type operator()(source_type const &s) const {
+    return convert(s);
+  }
+};
+
+/// Partial specialization for Array<int8_t> <= Array<int4b_t>
+template <
+  int N,
+  FloatRoundStyle Round
+>
+struct NumericArrayConverter<int8_t, int4b_t, N, Round> {
+  static_assert(!(N % 8), "N must be multiple of 8.");
+
+  using result_type = Array<int8_t, N>;
+  using source_type = Array<int4b_t, N>;
+  static FloatRoundStyle const round_style = Round;
+
+  CUTLASS_HOST_DEVICE
+  static result_type convert(source_type const & source) {
+
+    NumericArrayConverter<int8_t, int4b_t, 8, Round> convert_vector_;
+
+    result_type result;
+
+    Array<int8_t, 8> *result_ptr = reinterpret_cast<Array<int8_t, 8> *>(&result);
+    Array<int4b_t, 8> const *source_ptr = reinterpret_cast<Array<int4b_t, 8> const *>(&source);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < N / 8; ++i) {
+      result_ptr[i] = convert_vector_(source_ptr[i]);
+    }
+
+    return result;
+  }
+
+  CUTLASS_HOST_DEVICE
+  result_type operator()(source_type const &s) const {
+    return convert(s);
+  }
+};
+
 #endif // Conditional guards to enable partial specialization for packed integers

 namespace detail {
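
The PTX above sign-extends the eight packed nibbles by handling even and odd nibbles separately (prmt with the sign-replicating selector 0xba98 fills the upper half of each byte with the sign bit) and then interleaves the two halves back into their original order with the final two prmt instructions. A minimal device-side usage sketch, assuming only the converters added above (the widen_s4_to_s8 function name is illustrative):

    #include "cutlass/array.h"
    #include "cutlass/numeric_conversion.h"
    #include "cutlass/numeric_types.h"

    // Widen sixteen packed S4 values to S8 in registers; the N = 16 converter
    // dispatches to the 8-element PTX specialization above twice.
    __device__ cutlass::Array<int8_t, 16>
    widen_s4_to_s8(cutlass::Array<cutlass::int4b_t, 16> const &src) {
      cutlass::NumericArrayConverter<int8_t, cutlass::int4b_t, 16> converter;
      return converter(src);
    }

The N-element specialization simply unrolls over 8-element chunks, so any multiple of 8 works.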

python/cutlass_library/generator.py
Lines changed: 147 additions & 0 deletions

@@ -2854,6 +2854,151 @@ def GenerateSM80_TensorOp_16832_TN(manifest, cuda_version):
       else:
         op.C.alignment = 8

+#
+def GenerateSM80_TensorOp_16832_TN_mixed_input_upcast_a(manifest, cuda_version):
+
+  if not CudaToolkitVersionSatisfies(cuda_version, 11, 0):
+    return
+
+  layouts = [
+    (LayoutType.RowMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
+  ]
+
+  # Upcast on Operand A
+  math_instructions = [
+    MathInstruction( \
+      [16, 8, 32], \
+      DataType.s4, DataType.s8, DataType.s32, \
+      OpcodeClass.TensorOp, \
+      MathOperation.multiply_add_mixed_input_upcast),
+  ]
+
+  min_cc = 80
+  max_cc = 1024
+
+  # For mixed-input alignment constraints are a list of lists, where the
+  # inner list contains the alignment constraints for operands/matrices
+  # [[alignA, alignB, alignC],..]
+  alignment_constraints = [[32, 16, 4],]
+
+  for math_inst in math_instructions:
+    tile_descriptions = [
+      TileDescription([256, 128,  64], 3, [4, 2, 1], math_inst, min_cc, max_cc),
+      TileDescription([128, 256,  64], 3, [2, 4, 1], math_inst, min_cc, max_cc),
+      TileDescription([256,  64,  64], 4, [4, 1, 1], math_inst, min_cc, max_cc),
+      TileDescription([ 64, 256,  64], 4, [1, 4, 1], math_inst, min_cc, max_cc),
+      TileDescription([ 32, 256,  64], 4, [1, 4, 1], math_inst, min_cc, max_cc),
+      TileDescription([128, 128,  64], 5, [2, 2, 1], math_inst, min_cc, max_cc),
+      TileDescription([ 64, 128,  64], 6, [2, 2, 1], math_inst, min_cc, max_cc),
+      TileDescription([256, 128, 128], 3, [4, 2, 1], math_inst, min_cc, max_cc),
+      TileDescription([128, 256, 128], 3, [2, 4, 1], math_inst, min_cc, max_cc),
+      TileDescription([256,  64, 128], 4, [4, 1, 1], math_inst, min_cc, max_cc),
+      TileDescription([ 64, 256, 128], 4, [1, 4, 1], math_inst, min_cc, max_cc),
+      TileDescription([256,  32, 128], 4, [4, 1, 1], math_inst, min_cc, max_cc),
+      TileDescription([ 32, 256, 128], 4, [1, 4, 1], math_inst, min_cc, max_cc),
+      TileDescription([128, 128, 128], 4, [2, 2, 1], math_inst, min_cc, max_cc),
+      TileDescription([ 64, 128, 128], 3, [2, 2, 1], math_inst, min_cc, max_cc),
+      TileDescription([128,  32, 128], 4, [4, 1, 1], math_inst, min_cc, max_cc),
+    ]
+
+    data_type = [
+      math_inst.element_a,
+      math_inst.element_b,
+      math_inst.element_accumulator,
+      math_inst.element_accumulator,
+    ]
+
+    # streamk uses more regs which can cause spill for the biggest warp tile size when the accumulators are 32bit.
+    operations = CreateGemmOperator(manifest, layouts, tile_descriptions, \
+      data_type, alignment_constraints, None, EpilogueFunctor.LinearCombination, SwizzlingFunctor.Identity8)
+
+    # Avoid emitting two kernels if the accumulator type does not differ from the input type (e.g. S8 accumulation)
+    if math_inst.element_a != math_inst.element_accumulator:
+      alignment_constraints = [[32, 16, 16],]
+
+      data_type_mixed = [
+        math_inst.element_a,
+        math_inst.element_b,
+        math_inst.element_b,
+        math_inst.element_accumulator,
+      ]
+
+      operations += CreateGemmOperator(manifest, layouts, tile_descriptions, \
+        data_type_mixed, alignment_constraints, None, EpilogueFunctor.LinearCombination, SwizzlingFunctor.Identity8)
+
+#
+def GenerateSM80_TensorOp_16832_TN_mixed_input_upcast_b(manifest, cuda_version):
+
+  if not CudaToolkitVersionSatisfies(cuda_version, 11, 0):
+    return
+
+  layouts = [
+    (LayoutType.RowMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
+  ]
+
+  # Upcast on Operand B
+  math_instructions = [
+    MathInstruction( \
+      [16, 8, 32], \
+      DataType.s8, DataType.s4, DataType.s32, \
+      OpcodeClass.TensorOp, \
+      MathOperation.multiply_add_mixed_input_upcast),
+  ]
+
+  min_cc = 80
+  max_cc = 1024
+
+  # For mixed-input alignment constraints are a list of lists, where the
+  # inner list contains the alignment constraints for operands/matrices
+  # [[alignA, alignB, alignC],..]
+  alignment_constraints = [[16, 32, 4],]
+
+  for math_inst in math_instructions:
+    tile_descriptions = [
+      TileDescription([256, 128,  64], 3, [4, 2, 1], math_inst, min_cc, max_cc),
+      TileDescription([128, 256,  64], 3, [2, 4, 1], math_inst, min_cc, max_cc),
+      TileDescription([256,  64,  64], 4, [4, 1, 1], math_inst, min_cc, max_cc),
+      TileDescription([ 64, 256,  64], 4, [1, 4, 1], math_inst, min_cc, max_cc),
+      TileDescription([256,  32,  64], 4, [4, 1, 1], math_inst, min_cc, max_cc),
+      TileDescription([128, 128,  64], 5, [2, 2, 1], math_inst, min_cc, max_cc),
+      TileDescription([ 64, 128,  64], 6, [2, 2, 1], math_inst, min_cc, max_cc),
+      TileDescription([128,  32,  64], 6, [4, 1, 1], math_inst, min_cc, max_cc),
+      TileDescription([256, 128, 128], 3, [4, 2, 1], math_inst, min_cc, max_cc),
+      TileDescription([128, 256, 128], 3, [2, 4, 1], math_inst, min_cc, max_cc),
+      TileDescription([256,  64, 128], 4, [4, 1, 1], math_inst, min_cc, max_cc),
+      TileDescription([ 64, 256, 128], 4, [1, 4, 1], math_inst, min_cc, max_cc),
+      TileDescription([256,  32, 128], 4, [4, 1, 1], math_inst, min_cc, max_cc),
+      TileDescription([ 32, 256, 128], 4, [1, 4, 1], math_inst, min_cc, max_cc),
+      TileDescription([128, 128, 128], 4, [2, 2, 1], math_inst, min_cc, max_cc),
+      TileDescription([ 64, 128, 128], 3, [2, 2, 1], math_inst, min_cc, max_cc),
+      TileDescription([128,  32, 128], 4, [4, 1, 1], math_inst, min_cc, max_cc),
+    ]
+
+    data_type = [
+      math_inst.element_a,
+      math_inst.element_b,
+      math_inst.element_accumulator,
+      math_inst.element_accumulator,
+    ]
+
+    # streamk uses more regs which can cause spill for the biggest warp tile size when the accumulators are 32bit.
+    operations = CreateGemmOperator(manifest, layouts, tile_descriptions, \
+      data_type, alignment_constraints, None, EpilogueFunctor.LinearCombination, SwizzlingFunctor.Identity8)
+
+    # Avoid emitting two kernels if the accumulator type does not differ from the input type (e.g. S8 accumulation)
+    if math_inst.element_a != math_inst.element_accumulator:
+      alignment_constraints = [[16, 32, 16],]
+
+      data_type_mixed = [
+        math_inst.element_a,
+        math_inst.element_b,
+        math_inst.element_a,
+        math_inst.element_accumulator,
+      ]
+
+      operations += CreateGemmOperator(manifest, layouts, tile_descriptions, \
+        data_type_mixed, alignment_constraints, None, EpilogueFunctor.LinearCombination, SwizzlingFunctor.Identity8)
+
 #

 #
@@ -4699,6 +4844,8 @@ def GenerateSM80(manifest, cuda_version):
   GenerateSM80_TensorOp_16816_mixed_input_upcast_a(manifest, cuda_version)
   GenerateSM80_TensorOp_16816_mixed_input_upcast_b(manifest, cuda_version)
   GenerateSM80_TensorOp_16832_TN(manifest, cuda_version)
+  GenerateSM80_TensorOp_16832_TN_mixed_input_upcast_a(manifest, cuda_version)
+  GenerateSM80_TensorOp_16832_TN_mixed_input_upcast_b(manifest, cuda_version)
   GenerateSM80_SparseTensorOp_16864_TN(manifest, cuda_version)
   GenerateSM80_TensorOp_16832_Interleaved(manifest, cuda_version)
   GenerateSM80_TensorOp_16864_TN(manifest, cuda_version)