Skip to content

Commit 5f32bcf

Browse files
authored
[mlir][sparse][gpu] re-enable all GPU libgen tests (#72185)
The previous change no longer properly used the GPU libgen pass (even though most tests still passed by falling back to the CPU). This revision puts the proper pass order into place. It also cleans up the CPU codegen vs. libgen setup a bit.
1 parent 57dd23b commit 5f32bcf

File tree

15 files changed

+85
-78
lines changed

15 files changed

+85
-78
lines changed

mlir/include/mlir/Dialect/SparseTensor/Pipelines/Passes.h

+1-2
Original file line numberDiff line numberDiff line change
@@ -144,8 +144,7 @@ struct SparseCompilerOptions
144144

145145
/// Projects out the options for `createSparsificationPass`.
146146
SparsificationOptions sparsificationOptions() const {
147-
return SparsificationOptions(parallelization, enableGPULibgen,
148-
enableRuntimeLibrary);
147+
return SparsificationOptions(parallelization, enableRuntimeLibrary);
149148
}
150149

151150
/// Projects out the options for `createConvertVectorToLLVMPass`.

mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.h

+6-9
Original file line numberDiff line numberDiff line change
@@ -74,15 +74,11 @@ std::unique_ptr<Pass> createPreSparsificationRewritePass();
7474

7575
/// Options for the Sparsification pass.
7676
struct SparsificationOptions {
77-
SparsificationOptions(SparseParallelizationStrategy p, bool gpuLibgen,
78-
bool enableRT)
79-
: parallelizationStrategy(p), enableGPULibgen(gpuLibgen),
80-
enableRuntimeLibrary(enableRT) {}
77+
SparsificationOptions(SparseParallelizationStrategy p, bool enableRT)
78+
: parallelizationStrategy(p), enableRuntimeLibrary(enableRT) {}
8179
SparsificationOptions()
82-
: SparsificationOptions(SparseParallelizationStrategy::kNone, false,
83-
true) {}
80+
: SparsificationOptions(SparseParallelizationStrategy::kNone, true) {}
8481
SparseParallelizationStrategy parallelizationStrategy;
85-
bool enableGPULibgen;
8682
bool enableRuntimeLibrary;
8783
};
8884

@@ -196,7 +192,8 @@ void populateSparseGPULibgenPatterns(RewritePatternSet &patterns,
196192
bool enableRT);
197193

198194
std::unique_ptr<Pass> createSparseGPUCodegenPass();
199-
std::unique_ptr<Pass> createSparseGPUCodegenPass(unsigned numThreads);
195+
std::unique_ptr<Pass> createSparseGPUCodegenPass(unsigned numThreads,
196+
bool enableRT);
200197

201198
//===----------------------------------------------------------------------===//
202199
// The SparseStorageSpecifierToLLVM pass.
@@ -225,7 +222,7 @@ std::unique_ptr<Pass> createSparsificationAndBufferizationPass(
225222
const SparsificationOptions &sparsificationOptions,
226223
bool createSparseDeallocs, bool enableRuntimeLibrary,
227224
bool enableBufferInitialization, unsigned vectorLength,
228-
bool enableVLAVectorization, bool enableSIMDIndex32);
225+
bool enableVLAVectorization, bool enableSIMDIndex32, bool enableGPULibgen);
229226

230227
//===----------------------------------------------------------------------===//
231228
// Registration.

mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.td

+6-6
Original file line numberDiff line numberDiff line change
@@ -105,7 +105,6 @@ def SparsificationPass : Pass<"sparsification", "ModuleOp"> {
105105
"affine::AffineDialect",
106106
"arith::ArithDialect",
107107
"bufferization::BufferizationDialect",
108-
"gpu::GPUDialect",
109108
"LLVM::LLVMDialect",
110109
"linalg::LinalgDialect",
111110
"memref::MemRefDialect",
@@ -131,9 +130,6 @@ def SparsificationPass : Pass<"sparsification", "ModuleOp"> {
131130
clEnumValN(mlir::SparseParallelizationStrategy::kAnyStorageAnyLoop,
132131
"any-storage-any-loop",
133132
"Enable sparse parallelization for any storage and loop."))}]>,
134-
Option<"enableGPULibgen", "enable-gpu-libgen", "bool",
135-
"false",
136-
"Enable GPU acceleration by means of direct library calls (like cuSPARSE)">,
137133
Option<"enableRuntimeLibrary", "enable-runtime-library", "bool",
138134
"true", "Enable runtime library for manipulating sparse tensors">,
139135
];
@@ -368,7 +364,9 @@ def SparseVectorization : Pass<"sparse-vectorization", "ModuleOp"> {
368364
def SparseGPUCodegen : Pass<"sparse-gpu-codegen", "ModuleOp"> {
369365
let summary = "Generates GPU code during sparsification";
370366
let description = [{
371-
Enables the sparsifier to use GPU acceleration.
367+
Enables the sparsifier to use GPU acceleration. When the number of GPU
368+
threads is set to zero, the pass tries to enable GPU acceleration by
369+
means of direct library calls (like cuSPARSE).
372370
}];
373371
let constructor = "mlir::createSparseGPUCodegenPass()";
374372
let dependentDialects = [
@@ -381,7 +379,9 @@ def SparseGPUCodegen : Pass<"sparse-gpu-codegen", "ModuleOp"> {
381379
"sparse_tensor::SparseTensorDialect",
382380
];
383381
let options = [
384-
Option<"numThreads", "num_threads", "int32_t", "1024", "Sets the number of GPU threads">,
382+
Option<"numThreads", "num-threads", "int32_t", "1024", "Sets the number of GPU threads">,
383+
Option<"enableRuntimeLibrary", "enable-runtime-library", "bool",
384+
"true", "Enable runtime library for manipulating sparse tensors">,
385385
];
386386
}
387387

mlir/lib/Dialect/SparseTensor/Pipelines/SparseTensorPipelines.cpp

+10-1
Original file line numberDiff line numberDiff line change
@@ -31,18 +31,25 @@
3131

3232
void mlir::sparse_tensor::buildSparseCompiler(
3333
OpPassManager &pm, const SparseCompilerOptions &options) {
34+
// Rewrite named linalg ops into generic ops.
3435
pm.addNestedPass<func::FuncOp>(createLinalgGeneralizationPass());
36+
37+
// Sparsification and bufferization mini-pipeline.
3538
pm.addPass(createSparsificationAndBufferizationPass(
3639
getBufferizationOptionsForSparsification(
3740
options.testBufferizationAnalysisOnly),
3841
options.sparsificationOptions(), options.createSparseDeallocs,
3942
options.enableRuntimeLibrary, options.enableBufferInitialization,
4043
options.vectorLength,
4144
/*enableVLAVectorization=*/options.armSVE,
42-
/*enableSIMDIndex32=*/options.force32BitVectorIndices));
45+
/*enableSIMDIndex32=*/options.force32BitVectorIndices,
46+
options.enableGPULibgen));
47+
48+
// Bail-early for test setup.
4349
if (options.testBufferizationAnalysisOnly)
4450
return;
4551

52+
// Storage specifier lowering and bufferization wrap-up.
4653
pm.addPass(createStorageSpecifierToLLVMPass());
4754
pm.addNestedPass<func::FuncOp>(createCanonicalizerPass());
4855
pm.addNestedPass<func::FuncOp>(
@@ -72,8 +79,10 @@ void mlir::sparse_tensor::buildSparseCompiler(
7279
pm.addNestedPass<func::FuncOp>(createConvertMathToLLVMPass());
7380
pm.addPass(createConvertMathToLibmPass());
7481
pm.addPass(createConvertComplexToLibmPass());
82+
7583
// Repeat convert-vector-to-llvm.
7684
pm.addPass(createConvertVectorToLLVMPass(options.lowerVectorToLLVMOptions()));
85+
7786
pm.addPass(createConvertComplexToLLVMPass());
7887
pm.addPass(createConvertVectorToLLVMPass(options.lowerVectorToLLVMOptions()));
7988
pm.addPass(createConvertFuncToLLVMPass());

mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorPasses.cpp

+13-10
Original file line numberDiff line numberDiff line change
@@ -82,19 +82,15 @@ struct SparsificationPass
8282
SparsificationPass(const SparsificationPass &pass) = default;
8383
SparsificationPass(const SparsificationOptions &options) {
8484
parallelization = options.parallelizationStrategy;
85-
enableGPULibgen = options.enableGPULibgen;
8685
enableRuntimeLibrary = options.enableRuntimeLibrary;
8786
}
8887

8988
void runOnOperation() override {
9089
auto *ctx = &getContext();
9190
// Translate strategy flags to strategy options.
92-
SparsificationOptions options(parallelization, enableGPULibgen,
93-
enableRuntimeLibrary);
94-
// Apply GPU libgen (if requested), sparsification, and cleanup rewriting.
91+
SparsificationOptions options(parallelization, enableRuntimeLibrary);
92+
// Apply sparsification and cleanup rewriting.
9593
RewritePatternSet patterns(ctx);
96-
if (enableGPULibgen)
97-
populateSparseGPULibgenPatterns(patterns, enableRuntimeLibrary);
9894
populateSparsificationPatterns(patterns, options);
9995
scf::ForOp::getCanonicalizationPatterns(patterns, ctx);
10096
(void)applyPatternsAndFoldGreedily(getOperation(), std::move(patterns));
@@ -323,12 +319,18 @@ struct SparseGPUCodegenPass
323319
: public impl::SparseGPUCodegenBase<SparseGPUCodegenPass> {
324320
SparseGPUCodegenPass() = default;
325321
SparseGPUCodegenPass(const SparseGPUCodegenPass &pass) = default;
326-
SparseGPUCodegenPass(unsigned nT) { numThreads = nT; }
322+
SparseGPUCodegenPass(unsigned nT, bool enableRT) {
323+
numThreads = nT;
324+
enableRuntimeLibrary = enableRT;
325+
}
327326

328327
void runOnOperation() override {
329328
auto *ctx = &getContext();
330329
RewritePatternSet patterns(ctx);
331-
populateSparseGPUCodegenPatterns(patterns, numThreads);
330+
if (numThreads == 0)
331+
populateSparseGPULibgenPatterns(patterns, enableRuntimeLibrary);
332+
else
333+
populateSparseGPUCodegenPatterns(patterns, numThreads);
332334
(void)applyPatternsAndFoldGreedily(getOperation(), std::move(patterns));
333335
}
334336
};
@@ -457,8 +459,9 @@ std::unique_ptr<Pass> mlir::createSparseGPUCodegenPass() {
457459
return std::make_unique<SparseGPUCodegenPass>();
458460
}
459461

460-
std::unique_ptr<Pass> mlir::createSparseGPUCodegenPass(unsigned numThreads) {
461-
return std::make_unique<SparseGPUCodegenPass>(numThreads);
462+
std::unique_ptr<Pass> mlir::createSparseGPUCodegenPass(unsigned numThreads,
463+
bool enableRT) {
464+
return std::make_unique<SparseGPUCodegenPass>(numThreads, enableRT);
462465
}
463466

464467
std::unique_ptr<Pass> mlir::createStorageSpecifierToLLVMPass() {

mlir/lib/Dialect/SparseTensor/Transforms/SparsificationAndBufferizationPass.cpp

+10-5
Original file line numberDiff line numberDiff line change
@@ -65,15 +65,16 @@ class SparsificationAndBufferizationPass
6565
const SparsificationOptions &sparsificationOptions,
6666
bool createSparseDeallocs, bool enableRuntimeLibrary,
6767
bool enableBufferInitialization, unsigned vectorLength,
68-
bool enableVLAVectorization, bool enableSIMDIndex32)
68+
bool enableVLAVectorization, bool enableSIMDIndex32, bool enableGPULibgen)
6969
: bufferizationOptions(bufferizationOptions),
7070
sparsificationOptions(sparsificationOptions),
7171
createSparseDeallocs(createSparseDeallocs),
7272
enableRuntimeLibrary(enableRuntimeLibrary),
7373
enableBufferInitialization(enableBufferInitialization),
7474
vectorLength(vectorLength),
7575
enableVLAVectorization(enableVLAVectorization),
76-
enableSIMDIndex32(enableSIMDIndex32) {}
76+
enableSIMDIndex32(enableSIMDIndex32), enableGPULibgen(enableGPULibgen) {
77+
}
7778

7879
/// Bufferize all dense ops. This assumes that no further analysis is needed
7980
/// and that all required buffer copies were already inserted by
@@ -139,6 +140,8 @@ class SparsificationAndBufferizationPass
139140
// of `bufferization.alloc_tensor` ops.
140141
{
141142
OpPassManager pm("builtin.module");
143+
if (enableGPULibgen)
144+
pm.addPass(createSparseGPUCodegenPass(0, enableRuntimeLibrary));
142145
pm.addPass(createSparseReinterpretMapPass(ReinterpretMapScope::kAll));
143146
pm.addPass(createSparsificationPass(sparsificationOptions));
144147
pm.addNestedPass<func::FuncOp>(createStageSparseOperationsPass());
@@ -177,6 +180,7 @@ class SparsificationAndBufferizationPass
177180
unsigned vectorLength;
178181
bool enableVLAVectorization;
179182
bool enableSIMDIndex32;
183+
bool enableGPULibgen;
180184
};
181185

182186
} // namespace sparse_tensor
@@ -210,18 +214,19 @@ std::unique_ptr<mlir::Pass> mlir::createSparsificationAndBufferizationPass() {
210214
/*enableBufferInitialization=*/false,
211215
/*vectorLength=*/0,
212216
/*enableVLAVectorization=*/false,
213-
/*enableSIMDIndex32=*/false);
217+
/*enableSIMDIndex32=*/false,
218+
/*enableGPULibgen=*/false);
214219
}
215220

216221
std::unique_ptr<mlir::Pass> mlir::createSparsificationAndBufferizationPass(
217222
const bufferization::OneShotBufferizationOptions &bufferizationOptions,
218223
const SparsificationOptions &sparsificationOptions,
219224
bool createSparseDeallocs, bool enableRuntimeLibrary,
220225
bool enableBufferInitialization, unsigned vectorLength,
221-
bool enableVLAVectorization, bool enableSIMDIndex32) {
226+
bool enableVLAVectorization, bool enableSIMDIndex32, bool enableGPULibgen) {
222227
return std::make_unique<
223228
mlir::sparse_tensor::SparsificationAndBufferizationPass>(
224229
bufferizationOptions, sparsificationOptions, createSparseDeallocs,
225230
enableRuntimeLibrary, enableBufferInitialization, vectorLength,
226-
enableVLAVectorization, enableSIMDIndex32);
231+
enableVLAVectorization, enableSIMDIndex32, enableGPULibgen);
227232
}

mlir/test/Dialect/SparseTensor/GPU/gpu_matmul_lib.mlir

+1-2
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
1-
// RUN: mlir-opt %s --linalg-generalize-named-ops \
2-
// RUN: --sparsification="enable-gpu-libgen" | FileCheck %s
1+
// RUN: mlir-opt %s --linalg-generalize-named-ops --sparse-gpu-codegen="num-threads=0" | FileCheck %s
32

43
#CSR = #sparse_tensor.encoding<{ map = (d0, d1) -> (d0 : dense, d1 : compressed) }>
54

mlir/test/Dialect/SparseTensor/GPU/gpu_matmul_lib_2to4.mlir

+1-2
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
1-
// RUN: mlir-opt %s --linalg-generalize-named-ops \
2-
// RUN: --sparsification="enable-gpu-libgen" | FileCheck %s
1+
// RUN: mlir-opt %s --linalg-generalize-named-ops --sparse-gpu-codegen="num-threads=0" | FileCheck %s
32

43
// CHECK-LABEL: func.func @matmul(
54
// CHECK-SAME: %[[VAL_0:.*0]]: tensor<?x?xf16>,

mlir/test/Dialect/SparseTensor/GPU/gpu_matvec_lib.mlir

+1-2
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
1-
// RUN: mlir-opt %s --linalg-generalize-named-ops \
2-
// RUN: --sparsification="enable-gpu-libgen" | FileCheck %s
1+
// RUN: mlir-opt %s --linalg-generalize-named-ops --sparse-gpu-codegen="num-threads=0" | FileCheck %s
32

43
#SortedCOO = #sparse_tensor.encoding<{
54
map = (d0, d1) -> (d0 : compressed(nonunique), d1 : singleton)

mlir/test/Dialect/SparseTensor/GPU/gpu_sampled_matmul_lib.mlir

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
// RUN: mlir-opt %s --sparsification="enable-gpu-libgen" | FileCheck %s
1+
// RUN: mlir-opt %s --sparse-gpu-codegen="num-threads=0" | FileCheck %s
22

33
#trait_sampled_dense_dense = {
44
indexing_maps = [

mlir/test/Dialect/SparseTensor/GPU/gpu_sddmm_lib.mlir

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
// RUN: mlir-opt %s --sparsification="enable-gpu-libgen" | FileCheck %s
1+
// RUN: mlir-opt %s --sparse-gpu-codegen="num-threads=0" | FileCheck %s
22

33
#BSR = #sparse_tensor.encoding<{
44
map = (i, j) -> (

mlir/test/Dialect/SparseTensor/GPU/gpu_spgemm_lib.mlir

+1-2
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
1-
// RUN: mlir-opt %s --linalg-generalize-named-ops \
2-
// RUN: --sparsification="enable-gpu-libgen" | FileCheck %s
1+
// RUN: mlir-opt %s --linalg-generalize-named-ops --sparse-gpu-codegen="num-threads=0" | FileCheck %s
32

43
#CSR = #sparse_tensor.encoding<{ map = (d0, d1) -> (d0 : dense, d1 : compressed) }>
54

mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matmul-lib.mlir

100755 → 100644
File mode changed.

mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matvec-lib.mlir

100755 → 100644
File mode changed.

mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-sddmm-lib.mlir

+33-35
Original file line numberDiff line numberDiff line change
@@ -85,32 +85,30 @@ module {
8585
// A kernel that computes a BSR sampled dense matrix matrix multiplication
8686
// using a "spy" function and in-place update of the sampling sparse matrix.
8787
//
88-
// TODO: re-enable the following test.
89-
//
90-
// func.func @SDDMM_block(%args: tensor<?x?xf32, #BSR>,
91-
// %arga: tensor<?x?xf32>,
92-
// %argb: tensor<?x?xf32>) -> tensor<?x?xf32, #BSR> {
93-
// %result = linalg.generic #trait_SDDMM
94-
// ins(%arga, %argb: tensor<?x?xf32>, tensor<?x?xf32>)
95-
// outs(%args: tensor<?x?xf32, #BSR>) {
96-
// ^bb(%a: f32, %b: f32, %s: f32):
97-
// %f0 = arith.constant 0.0 : f32
98-
// %u = sparse_tensor.unary %s : f32 to f32
99-
// present={
100-
// ^bb0(%p: f32):
101-
// %mul = arith.mulf %a, %b : f32
102-
// sparse_tensor.yield %mul : f32
103-
// }
104-
// absent={}
105-
// %r = sparse_tensor.reduce %s, %u, %f0 : f32 {
106-
// ^bb0(%p: f32, %q: f32):
107-
// %add = arith.addf %p, %q : f32
108-
// sparse_tensor.yield %add : f32
109-
// }
110-
// linalg.yield %r : f32
111-
// } -> tensor<?x?xf32, #BSR>
112-
// return %result : tensor<?x?xf32, #BSR>
113-
// }
88+
func.func @SDDMM_block(%args: tensor<?x?xf32, #BSR>,
89+
%arga: tensor<?x?xf32>,
90+
%argb: tensor<?x?xf32>) -> tensor<?x?xf32, #BSR> {
91+
%result = linalg.generic #trait_SDDMM
92+
ins(%arga, %argb: tensor<?x?xf32>, tensor<?x?xf32>)
93+
outs(%args: tensor<?x?xf32, #BSR>) {
94+
^bb(%a: f32, %b: f32, %s: f32):
95+
%f0 = arith.constant 0.0 : f32
96+
%u = sparse_tensor.unary %s : f32 to f32
97+
present={
98+
^bb0(%p: f32):
99+
%mul = arith.mulf %a, %b : f32
100+
sparse_tensor.yield %mul : f32
101+
}
102+
absent={}
103+
%r = sparse_tensor.reduce %s, %u, %f0 : f32 {
104+
^bb0(%p: f32, %q: f32):
105+
%add = arith.addf %p, %q : f32
106+
sparse_tensor.yield %add : f32
107+
}
108+
linalg.yield %r : f32
109+
} -> tensor<?x?xf32, #BSR>
110+
return %result : tensor<?x?xf32, #BSR>
111+
}
114112

115113
func.func private @getTensorFilename(index) -> (!Filename)
116114

@@ -153,15 +151,15 @@ module {
153151
//
154152
%fileName = call @getTensorFilename(%c0) : (index) -> (!Filename)
155153
%m_csr = sparse_tensor.new %fileName : !Filename to tensor<?x?xf32, #CSR>
156-
// %m_bsr = sparse_tensor.new %fileName : !Filename to tensor<?x?xf32, #BSR>
154+
%m_bsr = sparse_tensor.new %fileName : !Filename to tensor<?x?xf32, #BSR>
157155

158156
// Call the kernel.
159157
%0 = call @SDDMM(%m_csr, %a, %b)
160158
: (tensor<?x?xf32, #CSR>,
161159
tensor<?x?xf32>, tensor<?x?xf32>) -> tensor<?x?xf32, #CSR>
162-
// %1 = call @SDDMM_block(%m_bsr, %a, %b)
163-
// : (tensor<?x?xf32, #BSR>,
164-
// tensor<?x?xf32>, tensor<?x?xf32>) -> tensor<?x?xf32, #BSR>
160+
%1 = call @SDDMM_block(%m_bsr, %a, %b)
161+
: (tensor<?x?xf32, #BSR>,
162+
tensor<?x?xf32>, tensor<?x?xf32>) -> tensor<?x?xf32, #BSR>
165163

166164
//
167165
// Print the result for verification. Note that the "spy" determines what
@@ -170,18 +168,18 @@ module {
170168
// in the original zero positions).
171169
//
172170
// CHECK: ( 5, 10, 24, 19, 53, 42, 55, 56 )
173-
// C_HECK-NEXT: ( 5, 10, 8, 19, 24, 24, 40, 53, 42, 55, 56, 64 )
171+
// CHECK-NEXT: ( 5, 10, 8, 19, 24, 24, 40, 53, 42, 55, 56, 64 )
174172
//
175173
%v0 = sparse_tensor.values %0 : tensor<?x?xf32, #CSR> to memref<?xf32>
176174
%vv0 = vector.transfer_read %v0[%c0], %d0 : memref<?xf32>, vector<8xf32>
177175
vector.print %vv0 : vector<8xf32>
178-
// %v1 = sparse_tensor.values %1 : tensor<?x?xf32, #BSR> to memref<?xf32>
179-
// %vv1 = vector.transfer_read %v1[%c0], %d0 : memref<?xf32>, vector<12xf32>
180-
// vector.print %vv1 : vector<12xf32>
176+
%v1 = sparse_tensor.values %1 : tensor<?x?xf32, #BSR> to memref<?xf32>
177+
%vv1 = vector.transfer_read %v1[%c0], %d0 : memref<?xf32>, vector<12xf32>
178+
vector.print %vv1 : vector<12xf32>
181179

182180
// Release the resources.
183181
bufferization.dealloc_tensor %0 : tensor<?x?xf32, #CSR>
184-
// bufferization.dealloc_tensor %1 : tensor<?x?xf32, #BSR>
182+
bufferization.dealloc_tensor %1 : tensor<?x?xf32, #BSR>
185183

186184
llvm.call @mgpuDestroySparseEnv() : () -> ()
187185
return

0 commit comments

Comments
 (0)