@@ -190,42 +190,39 @@ Multiply(const Matrix<T, M, K, MatrixUse::A, MatrixScope::ThreadGroup>,
190190
191191template <typename OutputElTy, typename InputElTy, SIZE_TYPE M, SIZE_TYPE K,
192192 ComponentEnum MatrixDT, MatrixScopeEnum Scope>
193- vector<OutputElTy, K > Multiply(vector<InputElTy , M>,
194- Matrix<MatrixDT, M, K, MatrixUse::B, Scope >);
193+ vector<OutputElTy, M > Multiply(Matrix<MatrixDT , M, K, MatrixUse::A, Scope >,
194+ vector<InputElTy, K >);
195195
196196template <typename OutputElTy, typename InputElTy, typename BiasElTy,
197197 SIZE_TYPE M, SIZE_TYPE K, ComponentEnum MatrixDT,
198198 MatrixScopeEnum Scope>
199- vector<OutputElTy, K> MultiplyAdd(vector<InputElTy, M>,
200- Matrix<MatrixDT, M, K, MatrixUse::B, Scope>,
201- vector<BiasElTy, K>);
199+ vector<OutputElTy, M> MultiplyAdd(Matrix<MatrixDT, M, K, MatrixUse::A, Scope>,
200+ vector<InputElTy, K>, vector<BiasElTy, M>);
202201
203- template <typename OutputElTy, typename InputElTy,
204- ComponentEnum InputInterp, typename BiasElTy, SIZE_TYPE M,
205- SIZE_TYPE N, SIZE_TYPE K, ComponentEnum MatrixDT,
206- MatrixScopeEnum Scope>
207- typename hlsl::enable_if<InterpretedVector<InputElTy, N, InputInterp>::Size ==
202+ template <typename OutputElTy, typename InputElTy, ComponentEnum InputInterp,
203+ typename BiasElTy, SIZE_TYPE M, SIZE_TYPE VecM, SIZE_TYPE K,
204+ ComponentEnum MatrixDT, MatrixScopeEnum Scope>
205+ typename hlsl::enable_if<InterpretedVector<InputElTy, VecM, InputInterp>::Size ==
208- M,
206+ K,
209- vector<OutputElTy, K> >::type
207+ vector<OutputElTy, M> >::type
210- MultiplyAdd(InterpretedVector<InputElTy, N, InputInterp >,
211- Matrix<MatrixDT, M, K, MatrixUse::B, Scope >,
208+ MultiplyAdd(Matrix<MatrixDT, M, K, MatrixUse::A, Scope >,
209+ InterpretedVector<InputElTy, VecM, InputInterp >,
212- vector<BiasElTy, K>);
210+ vector<BiasElTy, M>);
213211
214212template <typename OutputElTy, typename InputElTy, ComponentEnum BiasElTy,
215213 SIZE_TYPE M, SIZE_TYPE K, ComponentEnum MatrixDT>
216- vector<OutputElTy, K>
214+ vector<OutputElTy, M>
217- MultiplyAdd(vector<InputElTy, M>,
218- Matrix<MatrixDT, M, K, MatrixUse::B, MatrixScope::Thread>,
219- VectorRef<BiasElTy, K>);
215+ MultiplyAdd(Matrix<MatrixDT, M, K, MatrixUse::A, MatrixScope::Thread>,
216+ vector<InputElTy, K>, VectorRef<BiasElTy, M>);
220217
221- template <typename OutputElTy, typename InputElTy,
222- ComponentEnum InputInterp, ComponentEnum BiasElTy ,
223- SIZE_TYPE M, SIZE_TYPE N, SIZE_TYPE K, ComponentEnum MatrixDT>
224- typename hlsl::enable_if<InterpretedVector<InputElTy, N , InputInterp>::Size ==
218+ template <typename OutputElTy, typename InputElTy, ComponentEnum InputInterp,
219+ ComponentEnum BiasElTy, SIZE_TYPE M, SIZE_TYPE VecM, SIZE_TYPE K ,
220+ ComponentEnum MatrixDT>
221+ typename hlsl::enable_if<InterpretedVector<InputElTy, VecM , InputInterp>::Size ==
225- M,
222+ K,
226- vector<OutputElTy, K> >::type
223+ vector<OutputElTy, M> >::type
227- MultiplyAdd(InterpretedVector<InputElTy, N, InputInterp >,
228- Matrix<MatrixDT, M, K, MatrixUse::B, MatrixScope::Thread >,
224+ MultiplyAdd(Matrix<MatrixDT, M, K, MatrixUse::A, MatrixScope::Thread >,
225+ InterpretedVector<InputElTy, VecM, InputInterp >,
229- VectorRef<BiasElTy, K>);
226+ VectorRef<BiasElTy, M>);
230227
231228// Outer product functions
@@ -282,32 +279,30 @@ ByteAddressBuffer B : register(t0);
282279
283280void CoopVec () {
284281 using namespace dx::linalg;
285- using MatrixBTy = Matrix<ComponentType::F16, 16, 16, MatrixUse::B,
286- MatrixScope::Thread>;
282+ using MatrixATy =
283+ Matrix<ComponentType::F16, 16, 16, MatrixUse::A, MatrixScope::Thread>;
287284
288285 vector<float16_t, 16> Vec = (vector<float16_t, 16>)0;
289- MatrixBTy MatB = MatrixBTy ::Load(
286+ MatrixATy MatA = MatrixATy ::Load(
290287 MBuf, 0, /* Row stride = number of columns * element size * / 16 * 4,
291288 MatrixLayout::RowMajor);
292- vector<float16_t, 16> Layer1 = Multiply<float16_t>(Vec, MatB );
289+ vector<float16_t, 16> Layer1 = Multiply<float16_t>(MatA, Vec );
293290
294291 vector<float16_t, 16> NullBias = (vector<float16_t, 16>)0;
295- vector<float16_t, 16> Layer2 = MultiplyAdd<float16_t>(Layer1, MatB , NullBias);
292+ vector<float16_t, 16> Layer2 = MultiplyAdd<float16_t>(MatA, Layer1 , NullBias);
296293
297294 VectorRef<ComponentType::F8_E4M3, 16> MemBias = {MBuf,
298- /* start offset* / 4096};
299- vector<float16_t, 16> Layer3 = MultiplyAdd<float16_t>(Layer2, MatB , MemBias);
295+ /* start offset* / 4096};
296+ vector<float16_t, 16> Layer3 = MultiplyAdd<float16_t>(MatA, Layer2 , MemBias);
300297
301298 // Clang doesn't yet support packed types.
302299#ifdef __hlsl_dx_compiler
303300 vector<uint8_t4_packed, 4> SomeData = (vector<uint8_t4_packed, 4>)0;
304301
305302 vector<float16_t, 16> Layer4 = MultiplyAdd<float16_t>(
306- MakeInterpretedVector< ComponentType::F8_E4M3 > (SomeData), MatB,
307- MemBias);
303+ MatA, MakeInterpretedVector< ComponentType::F8_E4M3 > (SomeData), MemBias);
308304 vector<float16_t, 16> Layer5 = MultiplyAdd<float16_t>(
309- MakeInterpretedVector< ComponentType::F8_E4M3 > (SomeData), MatB,
310- NullBias);
305+ MatA, MakeInterpretedVector< ComponentType::F8_E4M3 > (SomeData), NullBias);
311306#endif
312307}
313308```
@@ -416,7 +411,7 @@ The following table summarizes the operations supported for each matrix scope:
416411| ` Matrix::SumAccumulate() ` | ✗ | ✓ | ✓ |
417412| ` linalg::Multiply(Matrix, Matrix) ` | ✗ | ✓ | ✓ |
418- | ` linalg::Multiply(vector, Matrix) ` | ✓ | ✗ | ✗ |
413+ | ` linalg::Multiply(Matrix, vector) ` | ✓ | ✗ | ✗ |
419- | ` linalg::MultiplyAdd(vector, Matrix , vector) ` | ✓ | ✗ | ✗ |
414+ | ` linalg::MultiplyAdd(Matrix, vector , vector) ` | ✓ | ✗ | ✗ |
420415| ` linalg::OuterProduct(vector, vector) ` | ✓ | ✓ | ✓ |
421416
422417Throughout this document a matrix may be described as having a scope as
@@ -697,7 +692,7 @@ Requires `Wave` or `ThreadGroup` scope matrix.
697692
698693Returns the number of matrix components accessible to the current thread. If the
699694matrix's elements are stored in a packed type, ` Length ` will return the number of
700- packed elements (e.g. if a thread has 8 accessible elements of ` int8 ` type
695+ packed elements (e.g. if a thread has 8 accessible elements of ` int8 ` type
701696packed into 2 ` int8_t4_packed ` , ` Length ` will return 2). The mapping and
702697distribution of threads to matrix elements is opaque and
703698implementation-specific. The value returned by ` Length ` may be different for
@@ -928,24 +923,30 @@ infers the type of the output accumulator to match the input vector element type
928923the other overload takes a template parameter for the output matrix element type.
929924All matrix scopes are allowed for the output matrix.
930925
931- #### linalg::MultiplyAdd(vector, Matrix , vector)
926+ #### linalg::MultiplyAdd(Matrix, vector , vector)
932927
933928``` c++
934929template <typename OutputElTy, typename InputElTy, typename BiasElTy, uint M,
935930 uint K, ComponentType MatrixDT>
936- vector<OutputElTy, K>
931+ vector<OutputElTy, M>
937- linalg::MultiplyAdd(vector<InputElTy , M>,
938- Matrix<MatrixDT , M, K, MatrixUse::B, MatrixScope::Thread >,
932+ linalg::MultiplyAdd(Matrix<MatrixDT, M, K, MatrixUse::A, MatrixScope::Thread>,
933+ vector<InputElTy, K>,
939- vector<BiasElTy, K>);
934+ vector<BiasElTy, M>);
940935```
941936
942937Requires ` Thread ` scope matrix input, may be called from divergent control flow.
943938
944- The ` linalg::MultiplyAdd ` function has an overload that takes an ` M ` -element, an
945- MxK ` B ` matrix with ` Thread ` scope , and a ` K ` -element vector. The operation
939+ The ` linalg::MultiplyAdd ` function has an overload that takes an MxK ` A ` matrix
940+ with ` Thread ` scope, a ` K ` -element input vector, and an ` M ` -element bias vector.
946- multiplies the ` M ` -element vector by the matrix then adds the ` K ` -element vector
947- producing a result ` K ` -element vector.
941+ The operation multiplies the matrix by the ` K ` -element input vector then adds
942+ the ` M ` -element bias vector producing a result ` M ` -element vector.
948943
944+ The input vector may be a native vector or an ` InterpretedVector ` which combines
945+ a packed element vector with an interpretation type. The ` M ` -element bias vector
946+ may also be a ` VectorRef ` which refers to a vector in memory. Using the ` VectorRef `
947+ overload makes it easier for the backend compiler to optimize the bias vector
948+ loads with the ALU operations.
949+
949950### DXIL Types
950951
951952This feature adds the following new DXIL enumerations, which used as immediate
@@ -1212,37 +1213,37 @@ Must be called from wave-uniform control flow.
12121213``` llvm
12131214declare <[NUMo] x [TYo]> @dx.op.matvecmul.v[NUMo][TYo].v[NUMi][TYi](
12141215 immarg i32, ; opcode
1216+ %dx.types.MatrixRef, ; matrix A
12151217 <[NUMi] x [TYi]>, ; input vector
1216- immarg i32, ; input interpretation type (DXILComponentType)
1217- %dx.types.MatrixRef ; matrix A
1218+ immarg i32 ; input interpretation type (DXILComponentType)
12181219)
12191220```
12201221
1221- This operation implements a row-vector multiplication against a ` B ` matrix of
1222+ This operation implements a matrix-vector multiplication using an ` A ` matrix of
12221223` Thread ` scope.
12231224
12241225Validation will enforce that:
1225- * The input vector length matches the ` M ` matrix dimension
1226- * The matrix A is a ` B ` matrix of ` Thread ` scope
1226+ * The input vector length matches the ` K ` matrix dimension
1227+ * The matrix A is an ` A ` matrix of ` Thread ` scope
12271228
12281229``` llvm
12291230declare <[NUMo] x [TYo]> @dx.op.matvecmuladd.v[NUMo][TYo].v[NUMi][TYi].v[NUMo][TYb](
12301231 immarg i32, ; opcode
1232+ %dx.types.MatrixRef, ; matrix A
12311233 <[NUMi] x [TYi]>, ; input vector
12321234 immarg i32, ; input interpretation type (DXILComponentType)
1233- %dx.types.MatrixRef, ; matrix A
12341235 <[NUMo] x [TYb]>, ; bias vector
12351236 immarg i32 ; bias interpretation type (DXILComponentType)
12361237)
12371238```
12381239
1239- This operation implements a row-vector multiplication against a ` B ` matrix of
1240+ This operation implements a matrix-vector multiplication using an ` A ` matrix of
12401241` Thread ` scope with a bias vector added to the result.
12411242
12421243Validation will enforce that:
1243- * The input vector length matches the ` M ` matrix dimension
1244- * The bias vector length matches the ` N ` matrix dimension
1245- * The matrix A is a ` B ` matrix of ` Thread ` scope
1244+ * The input vector length matches the ` K ` matrix dimension
1245+ * The bias vector length matches the ` M ` matrix dimension
1246+ * The matrix A is an ` A ` matrix of ` Thread ` scope
12461247
12471248``` llvm
12481249declare void @dx.op.matrixAccumulateToDescriptor(
@@ -1374,7 +1375,7 @@ in the [`DXILComponentType` enumeration](#dxil-enumerations).
13741375
13751376## Appendix 2: HLSL Header
13761377
1377- [ Compiler Explorer] ( https://godbolt.org/z/W5a7zbPr3 )
1378+ [ Compiler Explorer] ( https://godbolt.org/z/aPWK1KjeE )
13781379> Note: this mostly works with Clang, but has some issues to work out still.
13791380
13801381``` cpp
@@ -1639,41 +1640,39 @@ Multiply(const Matrix<T, M, K, MatrixUse::A, MatrixScope::ThreadGroup>,
16391640
16401641template <typename OutputElTy, typename InputElTy, SIZE_TYPE M, SIZE_TYPE K,
16411642 ComponentEnum MatrixDT, MatrixScopeEnum Scope>
1642- vector<OutputElTy, K > Multiply(vector<InputElTy , M>,
1643- Matrix<MatrixDT, M, K, MatrixUse::B, Scope >);
1643+ vector<OutputElTy, M > Multiply(Matrix<MatrixDT , M, K, MatrixUse::A, Scope >,
1644+ vector<InputElTy, K >);
16441645
16451646template <typename OutputElTy, typename InputElTy, typename BiasElTy,
16461647 SIZE_TYPE M, SIZE_TYPE K, ComponentEnum MatrixDT,
16471648 MatrixScopeEnum Scope>
1648- vector<OutputElTy, K> MultiplyAdd(vector<InputElTy, M>,
1649- Matrix<MatrixDT, M, K, MatrixUse::B, Scope>,
1650- vector<BiasElTy, K>);
1649+ vector<OutputElTy, M> MultiplyAdd(Matrix<MatrixDT, M, K, MatrixUse::A, Scope>,
1650+ vector<InputElTy, K>, vector<BiasElTy, M>);
16511651
16521652template <typename OutputElTy, typename InputElTy, ComponentEnum InputInterp,
1653- typename BiasElTy, SIZE_TYPE M, SIZE_TYPE N , SIZE_TYPE K,
1653+ typename BiasElTy, SIZE_TYPE M, SIZE_TYPE VecM , SIZE_TYPE K,
16541654 ComponentEnum MatrixDT, MatrixScopeEnum Scope>
1655- typename hlsl::enable_if<InterpretedVector<InputElTy, N , InputInterp>::Size ==
1655+ typename hlsl::enable_if<InterpretedVector<InputElTy, VecM , InputInterp>::Size ==
1656- M,
1656+ K,
1657- vector<OutputElTy, K> >::type
1657+ vector<OutputElTy, M> >::type
1658- MultiplyAdd(InterpretedVector<InputElTy, N, InputInterp >,
1659- Matrix<MatrixDT, M, K, MatrixUse::B, Scope >,
1658+ MultiplyAdd(Matrix<MatrixDT, M, K, MatrixUse::A, Scope >,
1659+ InterpretedVector<InputElTy, VecM, InputInterp >,
1660- vector<BiasElTy, K>);
1660+ vector<BiasElTy, M>);
16611661
16621662template <typename OutputElTy, typename InputElTy, ComponentEnum BiasElTy,
16631663 SIZE_TYPE M, SIZE_TYPE K, ComponentEnum MatrixDT>
1664- vector<OutputElTy, K>
1664+ vector<OutputElTy, M>
1665- MultiplyAdd(vector<InputElTy, M>,
1666- Matrix<MatrixDT, M, K, MatrixUse::B, MatrixScope::Thread>,
1667- VectorRef<BiasElTy, K>);
1665+ MultiplyAdd(Matrix<MatrixDT, M, K, MatrixUse::A, MatrixScope::Thread>,
1666+ vector<InputElTy, K>, VectorRef<BiasElTy, M>);
16681667
16691668template <typename OutputElTy, typename InputElTy, ComponentEnum InputInterp,
1670- ComponentEnum BiasElTy, SIZE_TYPE M, SIZE_TYPE N , SIZE_TYPE K,
1669+ ComponentEnum BiasElTy, SIZE_TYPE M, SIZE_TYPE VecM , SIZE_TYPE K,
16711670 ComponentEnum MatrixDT>
1672- typename hlsl::enable_if<InterpretedVector<InputElTy, N , InputInterp>::Size ==
1671+ typename hlsl::enable_if<InterpretedVector<InputElTy, VecM , InputInterp>::Size ==
1673- M,
1672+ K,
1674- vector<OutputElTy, K> >::type
1673+ vector<OutputElTy, M> >::type
1675- MultiplyAdd(InterpretedVector<InputElTy, N, InputInterp >,
1676- Matrix<MatrixDT, M, K, MatrixUse::B, MatrixScope::Thread >,
1674+ MultiplyAdd(Matrix<MatrixDT, M, K, MatrixUse::A, MatrixScope::Thread >,
1675+ InterpretedVector<InputElTy, VecM, InputInterp >,
1677- VectorRef<BiasElTy, K>);
1676+ VectorRef<BiasElTy, M>);
16781677
16791678// Outer product functions
@@ -1722,30 +1721,30 @@ ByteAddressBuffer MBuf : register(t0);
17221721
17231722void CoopVec() {
17241723 using namespace dx::linalg;
1725- using MatrixBTy =
1726- Matrix<ComponentType::F16, 16, 16, MatrixUse::B , MatrixScope::Thread>;
1724+ using MatrixATy =
1725+ Matrix<ComponentType::F16, 16, 16, MatrixUse::A , MatrixScope::Thread>;
17271726
17281727 vector<float16_t, 16> Vec = (vector<float16_t, 16>)0;
1729- MatrixBTy MatB = MatrixBTy ::Load(
1728+ MatrixATy MatA = MatrixATy ::Load(
17301729 MBuf, 0, /* Row stride = number of columns * element size * / 16 * 4,
17311730 MatrixLayout::RowMajor);
1732- vector<float16_t, 16> Layer1 = Multiply<float16_t>(Vec, MatB );
1731+ vector<float16_t, 16> Layer1 = Multiply<float16_t>(MatA, Vec );
17331732
17341733 vector<float16_t, 16> NullBias = (vector<float16_t, 16>)0;
1735- vector<float16_t, 16> Layer2 = MultiplyAdd<float16_t>(Layer1, MatB , NullBias);
1734+ vector<float16_t, 16> Layer2 = MultiplyAdd<float16_t>(MatA, Layer1 , NullBias);
17361735
17371736 VectorRef<ComponentType::F8_E4M3, 16> MemBias = {MBuf,
17381737 /* start offset* / 4096};
1739- vector<float16_t, 16> Layer3 = MultiplyAdd<float16_t>(Layer2, MatB , MemBias);
1738+ vector<float16_t, 16> Layer3 = MultiplyAdd<float16_t>(MatA, Layer2 , MemBias);
17401739
17411740 // Clang doesn't yet support packed types.
17421741#ifdef __ hlsl_dx_compiler
17431742 vector<uint8_t4_packed, 4> SomeData = (vector<uint8_t4_packed, 4>)0;
17441743
17451744 vector<float16_t, 16> Layer4 = MultiplyAdd<float16_t>(
1746- MakeInterpretedVector< ComponentType::F8_E4M3 > (SomeData), MatB , MemBias);
1745+ MatA, MakeInterpretedVector< ComponentType::F8_E4M3 > (SomeData), MemBias);
17471746 vector<float16_t, 16> Layer5 = MultiplyAdd<float16_t>(
1748- MakeInterpretedVector< ComponentType::F8_E4M3 > (SomeData), MatB , NullBias);
1747+ MatA, MakeInterpretedVector< ComponentType::F8_E4M3 > (SomeData), NullBias);
17491748#endif
17501749}
17511750
0 commit comments