@@ -5270,7 +5270,7 @@ static MachineBasicBlock *emitIndirectDst(MachineInstr &MI,
   return LoopBB;
 }

-static uint32_t getIdentityValueForWaveReduction(unsigned Opc) {
+static uint32_t getIdentityValueFor32BitWaveReduction(unsigned Opc) {
   switch (Opc) {
   case AMDGPU::S_MIN_U32:
     return std::numeric_limits<uint32_t>::max();
@@ -5288,10 +5288,35 @@ static uint32_t getIdentityValueForWaveReduction(unsigned Opc) {
   case AMDGPU::S_AND_B32:
     return std::numeric_limits<uint32_t>::max();
   default:
-    llvm_unreachable("Unexpected opcode in getIdentityValueForWaveReduction");
+    llvm_unreachable(
+        "Unexpected opcode in getIdentityValueFor32BitWaveReduction");
   }
 }

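+// Identity values for the 64-bit reductions. Each reduction kind is tagged
+// with the V_CMP opcode that the lowering loop later uses to compare a lane
+// value against the running accumulator, and the identity is the neutral
+// element of that comparison (e.g. UINT64_MAX for umin).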
+static uint64_t getIdentityValueFor64BitWaveReduction(unsigned Opc) {
+  switch (Opc) {
+  case AMDGPU::V_CMP_LT_U64_e64: // umin.u64
+    return std::numeric_limits<uint64_t>::max();
+  case AMDGPU::V_CMP_LT_I64_e64: // min.i64
+    return std::numeric_limits<int64_t>::max();
+  case AMDGPU::V_CMP_GT_U64_e64: // umax.u64
+    return std::numeric_limits<uint64_t>::min();
+  case AMDGPU::V_CMP_GT_I64_e64: // max.i64
+    return std::numeric_limits<int64_t>::min();
+  default:
+    llvm_unreachable(
+        "Unexpected opcode in getIdentityValueFor64BitWaveReduction");
+  }
+}
+
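+// Opcodes not listed here (the V_CMP_* tags above) are lowered as 64-bit
+// reductions.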
+static bool is32bitWaveReduceOperation(unsigned Opc) {
+  return Opc == AMDGPU::S_MIN_U32 || Opc == AMDGPU::S_MIN_I32 ||
+         Opc == AMDGPU::S_MAX_U32 || Opc == AMDGPU::S_MAX_I32 ||
+         Opc == AMDGPU::S_ADD_I32 || Opc == AMDGPU::S_SUB_I32 ||
+         Opc == AMDGPU::S_AND_B32 || Opc == AMDGPU::S_OR_B32 ||
+         Opc == AMDGPU::S_XOR_B32;
+}
+
 static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
                                           MachineBasicBlock &BB,
                                           const GCNSubtarget &ST,
@@ -5319,6 +5344,15 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
     RetBB = &BB;
     break;
   }
+  case AMDGPU::V_CMP_LT_U64_e64: // umin
+  case AMDGPU::V_CMP_LT_I64_e64: // min
+  case AMDGPU::V_CMP_GT_U64_e64: // umax
+  case AMDGPU::V_CMP_GT_I64_e64: { // max
+    // Idempotent operations.
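+    // With a uniform (SGPR) source every lane holds the same value, so the
+    // min/max reduction is just a 64-bit copy of that value.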
+    BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B64), DstReg).addReg(SrcReg);
+    RetBB = &BB;
+    break;
+  }
   case AMDGPU::S_XOR_B32:
   case AMDGPU::S_ADD_I32:
   case AMDGPU::S_SUB_I32: {
@@ -5391,6 +5425,7 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
     // so that we will get the next active lane for next iteration.
     MachineBasicBlock::iterator I = BB.end();
     Register SrcReg = MI.getOperand(1).getReg();
+    bool is32BitOpc = is32bitWaveReduceOperation(Opc);

     // Create Control flow for loop
     // Split MI's Machine Basic block into For loop
@@ -5400,73 +5435,144 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
     const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
     const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
     Register LoopIterator = MRI.createVirtualRegister(WaveMaskRegClass);
-    Register InitalValReg = MRI.createVirtualRegister(DstRegClass);
-
+    Register IdentityValReg = MRI.createVirtualRegister(DstRegClass);
     Register AccumulatorReg = MRI.createVirtualRegister(DstRegClass);
     Register ActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
     Register NewActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
-
-    Register FF1Reg = MRI.createVirtualRegister(DstRegClass);
-    Register LaneValueReg =
-        MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+    Register FF1Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+    Register LaneValueReg = MRI.createVirtualRegister(DstRegClass);

     bool IsWave32 = ST.isWave32();
-    unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
+    unsigned MovOpcForExec = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
     unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;

     // Create initial values of induction variable from Exec, Accumulator and
     // insert branch instr to newly created ComputeBlock
-    uint32_t InitalValue = getIdentityValueForWaveReduction(Opc);
-    auto TmpSReg =
-        BuildMI(BB, I, DL, TII->get(MovOpc), LoopIterator).addReg(ExecReg);
-    BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), InitalValReg)
-        .addImm(InitalValue);
+    BuildMI(BB, I, DL, TII->get(MovOpcForExec), LoopIterator).addReg(ExecReg);
+    if (is32BitOpc) {
+      uint32_t IdentityValue = getIdentityValueFor32BitWaveReduction(Opc);
+      BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), IdentityValReg)
+          .addImm(IdentityValue);
+    } else {
+      uint64_t IdentityValue = getIdentityValueFor64BitWaveReduction(Opc);
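+      // A 64-bit identity does not fit S_MOV_B32; S_MOV_B64_IMM_PSEUDO
+      // carries the full 64-bit immediate and is expanded to real moves
+      // later.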
+      BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B64_IMM_PSEUDO), IdentityValReg)
+          .addImm(IdentityValue);
+    }
     // clang-format off
     BuildMI(BB, I, DL, TII->get(AMDGPU::S_BRANCH))
         .addMBB(ComputeLoop);
     // clang-format on

     // Start constructing ComputeLoop
-    I = ComputeLoop->end();
+    I = ComputeLoop->begin();
     auto Accumulator =
         BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), AccumulatorReg)
-            .addReg(InitalValReg)
+            .addReg(IdentityValReg)
             .addMBB(&BB);
     auto ActiveBits =
         BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), ActiveBitsReg)
-            .addReg(TmpSReg->getOperand(0).getReg())
+            .addReg(LoopIterator)
             .addMBB(&BB);

+    I = ComputeLoop->end();
+    MachineInstr *NewAccumulator;
     // Perform the computations
     unsigned SFFOpc = IsWave32 ? AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64;
-    auto FF1 = BuildMI(*ComputeLoop, I, DL, TII->get(SFFOpc), FF1Reg)
-                   .addReg(ActiveBits->getOperand(0).getReg());
-    auto LaneValue = BuildMI(*ComputeLoop, I, DL,
-                             TII->get(AMDGPU::V_READLANE_B32), LaneValueReg)
-                         .addReg(SrcReg)
-                         .addReg(FF1->getOperand(0).getReg());
-    auto NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
-                              .addReg(Accumulator->getOperand(0).getReg())
-                              .addReg(LaneValue->getOperand(0).getReg());
-
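+    // FF1 yields the index of the lowest set bit in the remaining active-lane
+    // mask, i.e. the next lane whose value is folded into the accumulator.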
+    BuildMI(*ComputeLoop, I, DL, TII->get(SFFOpc), FF1Reg)
+        .addReg(ActiveBitsReg);
+    if (is32BitOpc) {
+      BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
+              LaneValueReg)
+          .addReg(SrcReg)
+          .addReg(FF1Reg);
+      NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
+                           .addReg(Accumulator->getOperand(0).getReg())
+                           .addReg(LaneValueReg);
+    } else {
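+      // V_READLANE_B32 moves 32 bits at a time, so the 64-bit source is read
+      // as two halves and reassembled with a REG_SEQUENCE.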
+      Register LaneValueLoReg =
+          MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+      Register LaneValueHiReg =
+          MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+      Register LaneValReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+      const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg);
+      const TargetRegisterClass *SrcSubRC =
+          TRI->getSubRegisterClass(SrcRC, AMDGPU::sub0);
+      MachineOperand Op1L = TII->buildExtractSubRegOrImm(
+          MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub0, SrcSubRC);
+      MachineOperand Op1H = TII->buildExtractSubRegOrImm(
+          MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub1, SrcSubRC);
+      // lane value input should be in an sgpr
+      BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
+              LaneValueLoReg)
+          .add(Op1L)
+          .addReg(FF1Reg);
+      BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
+              LaneValueHiReg)
+          .add(Op1H)
+          .addReg(FF1Reg);
+      auto LaneValue = BuildMI(*ComputeLoop, I, DL,
+                               TII->get(TargetOpcode::REG_SEQUENCE), LaneValReg)
+                           .addReg(LaneValueLoReg)
+                           .addImm(AMDGPU::sub0)
+                           .addReg(LaneValueHiReg)
+                           .addImm(AMDGPU::sub1);
+      switch (Opc) {
+      case AMDGPU::V_CMP_GT_I64_e64:
+      case AMDGPU::V_CMP_GT_U64_e64:
+      case AMDGPU::V_CMP_LT_I64_e64:
+      case AMDGPU::V_CMP_LT_U64_e64: {
+        Register LaneMaskReg = MRI.createVirtualRegister(WaveMaskRegClass);
+        Register ComparisonResultReg =
+            MRI.createVirtualRegister(WaveMaskRegClass);
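+        // The lane value already sits in SGPRs, and a VALU compare may take
+        // only a limited number of scalar sources (constant-bus restriction),
+        // so the accumulator is staged through a 64-bit VGPR pair first.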
+        const TargetRegisterClass *VregClass = TRI->getVGPR64Class();
+        const TargetRegisterClass *VSubRegClass =
+            TRI->getSubRegisterClass(VregClass, AMDGPU::sub0);
+        Register AccumulatorVReg = MRI.createVirtualRegister(VregClass);
+        MachineOperand SrcReg0Sub0 =
+            TII->buildExtractSubRegOrImm(MI, MRI, Accumulator->getOperand(0),
+                                         VregClass, AMDGPU::sub0, VSubRegClass);
+        MachineOperand SrcReg0Sub1 =
+            TII->buildExtractSubRegOrImm(MI, MRI, Accumulator->getOperand(0),
+                                         VregClass, AMDGPU::sub1, VSubRegClass);
+        BuildMI(*ComputeLoop, I, DL, TII->get(TargetOpcode::REG_SEQUENCE),
+                AccumulatorVReg)
+            .add(SrcReg0Sub0)
+            .addImm(AMDGPU::sub0)
+            .add(SrcReg0Sub1)
+            .addImm(AMDGPU::sub1);
+        BuildMI(*ComputeLoop, I, DL, TII->get(Opc), LaneMaskReg)
+            .addReg(LaneValue->getOperand(0).getReg())
+            .addReg(AccumulatorVReg);
+
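+        // Both compare operands are wave-uniform, so LaneMaskReg is either
+        // zero or the full exec mask (SALU only offers 64-bit eq/ne compares,
+        // hence the VALU compare). The S_AND with the live lanes also sets
+        // SCC, and S_CSELECT_B64 then takes the lane value when the
+        // comparison held, otherwise it keeps the current accumulator.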
+        unsigned AndOpc = IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
+        BuildMI(*ComputeLoop, I, DL, TII->get(AndOpc), ComparisonResultReg)
+            .addReg(LaneMaskReg)
+            .addReg(ActiveBitsReg);
+
+        NewAccumulator = BuildMI(*ComputeLoop, I, DL,
+                                 TII->get(AMDGPU::S_CSELECT_B64), DstReg)
+                             .addReg(LaneValue->getOperand(0).getReg())
+                             .addReg(Accumulator->getOperand(0).getReg());
+        break;
+      }
+      }
+    }
     // Manipulate the iterator to get the next active lane
     unsigned BITSETOpc =
         IsWave32 ? AMDGPU::S_BITSET0_B32 : AMDGPU::S_BITSET0_B64;
-    auto NewActiveBits =
-        BuildMI(*ComputeLoop, I, DL, TII->get(BITSETOpc), NewActiveBitsReg)
-            .addReg(FF1->getOperand(0).getReg())
-            .addReg(ActiveBits->getOperand(0).getReg());
+    BuildMI(*ComputeLoop, I, DL, TII->get(BITSETOpc), NewActiveBitsReg)
+        .addReg(FF1Reg)
+        .addReg(ActiveBitsReg);

     // Add phi nodes
     Accumulator.addReg(NewAccumulator->getOperand(0).getReg())
         .addMBB(ComputeLoop);
-    ActiveBits.addReg(NewActiveBits->getOperand(0).getReg())
-        .addMBB(ComputeLoop);
+    ActiveBits.addReg(NewActiveBitsReg).addMBB(ComputeLoop);

     // Creating branching
     unsigned CMPOpc = IsWave32 ? AMDGPU::S_CMP_LG_U32 : AMDGPU::S_CMP_LG_U64;
     BuildMI(*ComputeLoop, I, DL, TII->get(CMPOpc))
-        .addReg(NewActiveBits->getOperand(0).getReg())
+        .addReg(NewActiveBitsReg)
         .addImm(0);
     BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
         .addMBB(ComputeLoop);
@@ -5488,12 +5594,20 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
   switch (MI.getOpcode()) {
   case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U32:
     return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_U32);
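+  // The 64-bit variants pass the V_CMP opcode that encodes the comparison
+  // performed inside the reduction loop.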
+  case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U64:
+    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_LT_U64_e64);
   case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I32:
     return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_I32);
+  case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I64:
+    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_LT_I64_e64);
   case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32:
     return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_U32);
+  case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U64:
+    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_GT_U64_e64);
   case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I32:
     return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_I32);
+  case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I64:
+    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_GT_I64_e64);
   case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_I32:
     return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_I32);
   case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_I32: