intel
diff --git a/‎.pre-commit-config.yaml
+6 b/‎.pre-commit-config.yaml
+6
diff --git a/‎bench/triton_bench/matmul_ogs_details/_finalize_scatter.py
-150 b/‎bench/triton_bench/matmul_ogs_details/_finalize_scatter.py
-150
diff --git a/‎bench/triton_bench/matmul_ogs_details/_finalize_split_k.py
-38 b/‎bench/triton_bench/matmul_ogs_details/_finalize_split_k.py
-38
diff --git a/‎bin/RegisterTritonDialects.h
+1-1 b/‎bin/RegisterTritonDialects.h
+1-1
diff --git a/‎include/triton/Analysis/Allocation.h
-4 b/‎include/triton/Analysis/Allocation.h
-4
diff --git a/‎include/triton/Analysis/AxisInfo.h
-4 b/‎include/triton/Analysis/AxisInfo.h
-4
diff --git a/‎include/triton/Analysis/Membar.h
-1 b/‎include/triton/Analysis/Membar.h
-1
diff --git a/‎include/triton/Conversion/TritonGPUToLLVM/Utility.h
+43 b/‎include/triton/Conversion/TritonGPUToLLVM/Utility.h
+43
diff --git a/‎include/triton/Dialect/Triton/IR/TritonDialect.td
+2-1 b/‎include/triton/Dialect/Triton/IR/TritonDialect.td
+2-1
diff --git a/‎include/triton/Dialect/Triton/Transforms/Passes.h
+4-8 b/‎include/triton/Dialect/Triton/Transforms/Passes.h
+4-8
diff --git a/‎include/triton/Dialect/Triton/Transforms/Passes.td
+3-7 b/‎include/triton/Dialect/Triton/Transforms/Passes.td
+3-7
@@ -40,6 +40,12 @@ repos:
     hooks:
       - id: clang-format
 
+  - repo: https://github.com/pre-commit/mirrors-mypy
+    rev: "v1.15.0"
+    hooks:
+      - id: mypy
+        pass_filenames: false
+
   # Expand YAML anchors in files used by github workflows, because github can't
   # do this itself.  This lets us use anchors, which avoids code duplication.
   - repo: local
 
@@ -59,7 +59,7 @@ void registerTestTritonAMDGPURangeAnalysis();
 
 inline void registerTritonDialects(mlir::DialectRegistry &registry) {
   mlir::registerAllPasses();
-  mlir::registerTritonPasses();
+  mlir::triton::registerTritonPasses();
   mlir::triton::gpu::registerTritonGPUPasses();
   mlir::registerTritonNvidiaGPUPasses();
   mlir::test::intel::registerTestAxisInfoPass();
 
@@ -7,10 +7,6 @@
 #include "llvm/ADT/SetVector.h"
 #include "llvm/Support/raw_ostream.h"
 
-#include "triton/Dialect/Triton/IR/Dialect.h"
-#include "triton/Dialect/TritonGPU/IR/Dialect.h"
-#include "triton/Dialect/TritonNvidiaGPU/IR/Dialect.h"
-#include <atomic>
 #include <limits>
 
 namespace mlir {
 
@@ -6,12 +6,8 @@
 
 #include "mlir/Support/LLVM.h"
 #include "triton/Analysis/Utility.h"
-#include "triton/Dialect/Triton/IR/Dialect.h"
-#include "triton/Dialect/Triton/IR/Utility.h"
-#include "triton/Dialect/TritonGPU/IR/Dialect.h"
 
 #include <optional>
-#include <type_traits>
 
 namespace mlir::triton {
 
 
@@ -2,7 +2,6 @@
 #define TRITON_ANALYSIS_MEMBAR_H
 
 #include "Allocation.h"
-#include "llvm/ADT/SmallPtrSet.h"
 
 #include <set>
 
 
@@ -729,6 +729,49 @@ SmallVector<Value> unpackLLVector(Location loc, Value llvmVec,
 
 Value packLLVector(Location loc, ValueRange vals, RewriterBase &rewriter);
 
+inline std::optional<LLVM::AtomicBinOp> matchAtomicOp(RMWOp atomicOp) { // Translates an RMWOp kind into the matching LLVM::AtomicBinOp.
+  switch (atomicOp) {
+  case RMWOp::AND: // Bitwise kinds map to the underscore-prefixed enumerators.
+    return LLVM::AtomicBinOp::_and;
+  case RMWOp::OR:
+    return LLVM::AtomicBinOp::_or;
+  case RMWOp::XOR:
+    return LLVM::AtomicBinOp::_xor;
+  case RMWOp::ADD: // ADD and FADD are distinct kinds with distinct targets.
+    return LLVM::AtomicBinOp::add;
+  case RMWOp::FADD:
+    return LLVM::AtomicBinOp::fadd;
+  case RMWOp::MAX: // MAX/MIN and UMAX/UMIN are kept separate, one enumerator each.
+    return LLVM::AtomicBinOp::max;
+  case RMWOp::MIN:
+    return LLVM::AtomicBinOp::min;
+  case RMWOp::UMAX:
+    return LLVM::AtomicBinOp::umax;
+  case RMWOp::UMIN:
+    return LLVM::AtomicBinOp::umin;
+  case RMWOp::XCHG:
+    return LLVM::AtomicBinOp::xchg;
+  default: // Any RMWOp kind without an explicit case above has no mapping.
+    return {}; // Empty optional signals "no equivalent AtomicBinOp".
+  }
+}
+
+inline std::optional<LLVM::AtomicOrdering>
+getMemoryOrdering(MemSemantic memOrdering) { // Translates a MemSemantic value into the matching LLVM::AtomicOrdering.
+  switch (memOrdering) {
+  case MemSemantic::RELAXED: // RELAXED is expressed as the `monotonic` ordering.
+    return LLVM::AtomicOrdering::monotonic;
+  case MemSemantic::ACQUIRE:
+    return LLVM::AtomicOrdering::acquire;
+  case MemSemantic::RELEASE:
+    return LLVM::AtomicOrdering::release;
+  case MemSemantic::ACQUIRE_RELEASE:
+    return LLVM::AtomicOrdering::acq_rel;
+  default: // Any MemSemantic value not listed above has no mapping.
+    return {}; // Empty optional signals "no equivalent AtomicOrdering".
+  }
+}
+
 inline bool
 isSimpleSharedMemoryAccess(ArrayRef<int64_t> shape,
                            ArrayRef<int64_t> allocShape,
 
@@ -45,7 +45,8 @@ def Triton_Dialect : Dialect {
 
   let discardableAttrs = (ins
      "::mlir::IntegerAttr":$num_stages,
-     "::mlir::IntegerAttr":$latency
+     "::mlir::IntegerAttr":$latency,
+     "::mlir::IntegerAttr":$self_latency
   );
 
   let hasConstantMaterializer = 1;
 
@@ -6,18 +6,14 @@
 namespace mlir {
 namespace triton {
 
-std::unique_ptr<Pass> createCombineOpsPass();
-
-std::unique_ptr<Pass> createLoopInvariantCodeMotionPass();
-std::unique_ptr<Pass> createReorderBroadcastPass();
-std::unique_ptr<Pass> createRewriteTensorPointerPass();
-std::unique_ptr<Pass> createLoopUnrollPass();
-
-} // namespace triton
+// Generate the pass class declarations.
+#define GEN_PASS_DECL
+#include "triton/Dialect/Triton/Transforms/Passes.h.inc"
 
 #define GEN_PASS_REGISTRATION
 #include "triton/Dialect/Triton/Transforms/Passes.h.inc"
 
+} // namespace triton
 } // namespace mlir
 
 #endif
@@ -19,8 +19,6 @@ def TritonCombineOps : Pass</*cli-arg*/"triton-combine", /*Op*/"mlir::ModuleOp">
        => dot(x,y,splat(0))`
   }];
 
-  let constructor = "mlir::triton::createCombineOpsPass()";
-
   let dependentDialects = ["mlir::arith::ArithDialect"];
 }
 
@@ -33,7 +31,7 @@ def TritonReorderBroadcast : Pass</*cli-arg*/"triton-reorder-broadcast", /*Op*/"
     In the event of a match, the broadcast (or splat) operation is delayed
     and performed after the ElementWise operation.
   }];
-  let constructor = "mlir::triton::createReorderBroadcastPass()";
+
   let dependentDialects = ["mlir::triton::TritonDialect"];
 }
 
@@ -45,8 +43,6 @@ def TritonRewriteTensorPointer : Pass</*cli-arg*/"triton-rewrite-tensor-pointer"
     the pointer/mask/other for each load/store.
   }];
 
-  let constructor = "mlir::triton::createRewriteTensorPointerPass()";
-
   let dependentDialects = ["mlir::triton::TritonDialect"];
 }
 
@@ -56,7 +52,7 @@ def TritonLoopUnroll : Pass</*cli-arg*/"triton-loop-unroll", /*Op*/"mlir::Module
     The pass unrolls a scf loop with tt.loop_unroll_factor attribute. The attribute specialises how many iterations
     the loop should be unrolled.
   }];
-  let constructor = "mlir::triton::createLoopUnrollPass()";
+
   let dependentDialects = ["mlir::triton::TritonDialect"];
 }
 
@@ -68,7 +64,7 @@ def TritonLoopInvariantCodeMotion : Pass</*cli-arg*/"triton-licm", /*Op*/"mlir::
     generates a trip-count check. For scf.while loops, it clones the condition
     from the before body.
   }];
-  let constructor = "mlir::triton::createLoopInvariantCodeMotionPass()";
+
   let dependentDialects = ["mlir::triton::TritonDialect"];
 }