[SM 6.9] Backport vector dot for 6.8 linkage (microsoft#7809)

V-FEXrt · hekota · web-flow · commit b3e84a8fa06c · 2025-10-09T16:36:30.000-06:00
Fixes microsoft#7794. Vector dot in SM6.9 libraries will be converted to scalar dot when linked against an SM6.8 shader. --------- Co-authored-by: Helena Kotas <hekotas@microsoft.com>
diff --git a/lib/HLSL/DxilScalarizeVectorIntrinsics.cpp b/lib/HLSL/DxilScalarizeVectorIntrinsics.cpp
@@ -14,6 +14,7 @@
 #include "dxc/DXIL/DxilModule.h"
 #include "dxc/HLSL/DxilGenerationPass.h"
 
+#include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/IR/Constant.h"
 #include "llvm/IR/Constants.h"
@@ -33,6 +34,7 @@ static bool scalarizeVectorStore(hlsl::OP *HlslOP, const DataLayout &DL,
                                  CallInst *CI);
 static bool scalarizeVectorIntrinsic(hlsl::OP *HlslOP, CallInst *CI);
 static bool scalarizeVectorReduce(hlsl::OP *HlslOP, CallInst *CI);
+static bool scalarizeVectorDot(hlsl::OP *HlslOP, CallInst *CI);
 static bool scalarizeVectorWaveMatch(hlsl::OP *HlslOP, CallInst *CI);
 
 class DxilScalarizeVectorIntrinsics : public ModulePass {
@@ -66,6 +68,7 @@ class DxilScalarizeVectorIntrinsics : public ModulePass {
            OpClass == DXIL::OpCodeClass::RawBufferVectorLoad ||
            OpClass == DXIL::OpCodeClass::RawBufferVectorStore ||
            OpClass == DXIL::OpCodeClass::VectorReduce ||
+           OpClass == DXIL::OpCodeClass::Dot ||
            OpClass == DXIL::OpCodeClass::WaveMatch);
       if (!CouldRewrite)
         continue;
@@ -84,6 +87,9 @@ class DxilScalarizeVectorIntrinsics : public ModulePass {
         case DXIL::OpCodeClass::VectorReduce:
           Changed |= scalarizeVectorReduce(HlslOP, CI);
           continue;
+        case DXIL::OpCodeClass::Dot:
+          Changed |= scalarizeVectorDot(HlslOP, CI);
+          continue;
         case DXIL::OpCodeClass::WaveMatch:
           Changed |= scalarizeVectorWaveMatch(HlslOP, CI);
           continue;
@@ -337,6 +343,66 @@ static bool scalarizeVectorWaveMatch(hlsl::OP *HlslOP, CallInst *CI) {
   return true;
 }
 
+// Scalarize a vectorized dot product (FDot) call `CI` into the equivalent
+// scalar Dot2/Dot3/Dot4 call, using `HlslOP` to retrieve the scalar op
+// function. Returns true if `CI`'s uses were redirected to the new call;
+// unsupported vector sizes assert and are otherwise left untouched.
+static bool scalarizeVectorDot(hlsl::OP *HlslOP, CallInst *CI) {
+  Value *AVecArg = CI->getArgOperand(1);
+  Value *BVecArg = CI->getArgOperand(2);
+  VectorType *VecTy = cast<VectorType>(AVecArg->getType());
+  Type *ScalarTy = VecTy->getScalarType();
+  const unsigned VecSize = VecTy->getNumElements();
+
+  // The only valid opcode is FDot, which only has floating point overloads.
+  // If we hit this assert then this function's lowering needs to be updated.
+  assert(ScalarTy->isFloatingPointTy() && "Unexpected scalar type");
+
+  // Pick the scalar opcode before emitting any IR so that an unsupported
+  // size leaves the function untouched instead of producing a malformed call
+  // in builds where asserts are compiled out.
+  DXIL::OpCode DotOp = DXIL::OpCode::Dot4;
+  switch (VecSize) {
+  case 2:
+    DotOp = DXIL::OpCode::Dot2;
+    break;
+  case 3:
+    DotOp = DXIL::OpCode::Dot3;
+    break;
+  case 4:
+    DotOp = DXIL::OpCode::Dot4;
+    break;
+  // Calling dot on a vec1 is not typical but also not impossible.
+  // DXIL doesn't have a native Dot1 opcode but that's the same as a
+  // single FMul. HLOperationLower is expected to do the conversion
+  // so we assert here in case that ever changes.
+  case 1:
+    assert(false && "vector dot shouldn't appear for vec1");
+    return false;
+  default:
+    assert(false &&
+           "Vectors larger than 4 components are not supported in SM6.8");
+    return false;
+  }
+
+  // Scalar dot operands are laid out as: opcode, then every element of A,
+  // then every element of B. Extracts are emitted right before `CI`.
+  IRBuilder<> Builder(CI);
+  SmallVector<Value *, 9> Args(VecSize * 2 + 1);
+  Args[0] = Builder.getInt32(static_cast<unsigned>(DotOp));
+  for (unsigned EltIdx = 0; EltIdx < VecSize; EltIdx++) {
+    Args[EltIdx + 1] = Builder.CreateExtractElement(AVecArg, EltIdx);
+    Args[EltIdx + 1 + VecSize] = Builder.CreateExtractElement(BVecArg, EltIdx);
+  }
+
+  // Redirect every use of the vector op to the scalar call. `CI` itself is
+  // not erased here.
+  Function *Func = HlslOP->GetOpFunc(DotOp, ScalarTy);
+  Value *Dot = Builder.CreateCall(Func, Args, CI->getName());
+  CI->replaceAllUsesWith(Dot);
+  return true;
+}
+
 // Scalarize native vector operation represented by `CI`, generating
 // scalar calls for each element of the its vector parameters.
 // Use `HlslOP` to retrieve the associated scalar op function.
diff --git a/lib/HLSL/HLOperationLower.cpp b/lib/HLSL/HLOperationLower.cpp
@@ -2606,8 +2606,10 @@ Value *TranslateDot(CallInst *CI, IntrinsicOp IOP, OP::OpCode opcode,
   Type *EltTy = Ty->getScalarType();
 
   // SM6.9 introduced a DXIL operation for vectorized dot product
+  // The operation is only advantageous for vector sizes > 1; vec1s will be
+  // lowered to a single Mul.
   if (hlslOP->GetModule()->GetHLModule().GetShaderModel()->IsSM69Plus() &&
-      EltTy->isFloatingPointTy()) {
+      EltTy->isFloatingPointTy() && Ty->getVectorNumElements() > 1) {
     Value *arg1 = CI->getArgOperand(HLOperandIndex::kBinaryOpSrc1Idx);
     IRBuilder<> Builder(CI);
     Constant *opArg = hlslOP->GetU32Const((unsigned)DXIL::OpCode::FDot);
diff --git a/tools/clang/test/CodeGenDXIL/hlsl/types/longvec-scalarized-intrinsics-sm68.hlsl b/tools/clang/test/CodeGenDXIL/hlsl/types/longvec-scalarized-intrinsics-sm68.hlsl
@@ -146,11 +146,27 @@ float4 main(uint i : SV_PrimitiveID, uint4 m : M) : SV_Target {
   // CHECK: call float @dx.op.unary.f32(i32 21, float %{{.*}}) ; Exp(value)
   res += pow(vec1, vec2);
 
-  // CHECK: mul i32
-  // CHECK: call i32 @dx.op.tertiary.i32(i32 49, i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}) ; UMad(a,b,c)
-  // CHECK: call i32 @dx.op.tertiary.i32(i32 49, i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}) ; UMad(a,b,c)
-  // CHECK: call i32 @dx.op.tertiary.i32(i32 49, i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}) ; UMad(a,b,c)
-  res += dot(ivec1, ivec2);
+  vector<float, 2> fDot2L = rbuf.Load< vector<float, 2> >(i++*32);
+  vector<float, 2> fDot2R = rbuf.Load< vector<float, 2> >(i++*32);
+  vector<float, 3> fDot3L = rbuf.Load< vector<float, 3> >(i++*32);
+  vector<float, 3> fDot3R = rbuf.Load< vector<float, 3> >(i++*32);
+  vector<float, 4> fDot4L = rbuf.Load< vector<float, 4> >(i++*32);
+  vector<float, 4> fDot4R = rbuf.Load< vector<float, 4> >(i++*32);
+  vector<float, 4> fDotRes = 0;
+
+  // CHECK: fmul fast float %{{.*}}, %{{.*}}
+  fDotRes[0] = dot(fDot2L.x, fDot4R.w);
+
+  // CHECK: call float @dx.op.dot2.f32(i32 54, float %{{.*}}, float %{{.*}}, float %{{.*}}, float %{{.*}})  ; Dot2(ax,ay,bx,by)
+  fDotRes[1] = dot(fDot2L, fDot2R);
+
+  // CHECK: call float @dx.op.dot3.f32(i32 55, float %{{.*}}, float %{{.*}}, float %{{.*}}, float %{{.*}}, float %{{.*}}, float %{{.*}})  ; Dot3(ax,ay,az,bx,by,bz)
+  fDotRes[2] = dot(fDot3L, fDot3R);
+
+  // CHECK: call float @dx.op.dot4.f32(i32 56, float %{{.*}}, float %{{.*}}, float %{{.*}}, float %{{.*}}, float %{{.*}}, float %{{.*}}, float %{{.*}}, float %{{.*}})  ; Dot4(ax,ay,az,aw,bx,by,bz,bw)
+  fDotRes[3] = dot(fDot4L, fDot4R);
+
+  res += fDotRes;
 
   // CHECK: call float  @dx.op.unary.f32(i32 29, float  %{{.*}}) ; Round_z(value)
   // CHECK: call float  @dx.op.unary.f32(i32 29, float  %{{.*}}) ; Round_z(value)