[Clang][AArch64] Implement widening FMMLA intrinsics #165282

Amichaxx · 2025-10-27T17:19:56Z

Proposed in this ACLE proposal, this PR implements widening FMMLA intrinsics.

F16 to F32
MF8 to F32
MF8 to F16

Additional changes:

IsOverloadCvt flag renamed to IsOverloadFirstandLast for clarity, as the name implies conversion. Implementation remains unchanged.

- F16 to F32 - MF8 to F32 - MF8 to F16

llvmbot · 2025-10-27T17:20:29Z

@llvm/pr-subscribers-clang-codegen
@llvm/pr-subscribers-clang

@llvm/pr-subscribers-backend-aarch64

Author: Amina Chabane (Amichaxx)

Changes

Proposed in this ACLE proposal, this PR implements widening FMMLA intrinsics.

F16 to F32
MF8 to F32
MF8 to F16

Full diff: https://github.com/llvm/llvm-project/pull/165282.diff

9 Files Affected:

(modified) clang/include/clang/Basic/arm_sve.td (+12)
(added) clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_fmmla-f32f16.c (+33)
(added) clang/test/CodeGen/AArch64/sve2-intrinsics/acle_sve2_fmmla-f16mf8.c (+35)
(added) clang/test/CodeGen/AArch64/sve2-intrinsics/acle_sve2_fmmla-f32mf8.c (+36)
(modified) llvm/include/llvm/IR/IntrinsicsAArch64.td (+14)
(modified) llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td (+5-1)
(added) llvm/test/CodeGen/AArch64/sve-fmmla-f32f16.ll (+32)
(added) llvm/test/CodeGen/AArch64/sve2-fmmla-f16mf8.ll (+39)
(added) llvm/test/CodeGen/AArch64/sve2-fmmla-f32mf8.ll (+41)

diff --git a/clang/include/clang/Basic/arm_sve.td b/clang/include/clang/Basic/arm_sve.td
index d2b7b78b9970f..c63da3308d6a0 100644
--- a/clang/include/clang/Basic/arm_sve.td
+++ b/clang/include/clang/Basic/arm_sve.td
@@ -1196,6 +1196,18 @@ def SVMLLA_F32 : SInst<"svmmla[_f32]", "dddd","f", MergeNone, "aarch64_sve_fmmla
 let SVETargetGuard = "f64mm", SMETargetGuard = InvalidMode in {
 def SVMLLA_F64 : SInst<"svmmla[_f64]",  "dddd", "d", MergeNone, "aarch64_sve_fmmla">;
 
+let SVETargetGuard = "sve-f16f32mm", SMETargetGuard = InvalidMode in {
+  def SVMLLA_F32_F16  : SInst<"svmmla[_f32_f16]", "MMdd", "h", MergeNone, "aarch64_sve_fmmla_f16f32", [IsOverloadNone]>;
+}
+
+let SVETargetGuard = "sve2,f8f32mm", SMETargetGuard = InvalidMode in {
+  def SVMLLA_F32_MF8 : SInst<"svmmla[_f32_mf8]", "MM~~>", "m", MergeNone, "aarch64_sve_fmmla_mf8f32", [IsOverloadNone]>;
+}
+
+let SVETargetGuard = "sve2,f8f16mm", SMETargetGuard = InvalidMode in {
+  def SVMLLA_F16_MF8 : SInst<"svmmla[_f16_mf8]", "OO~~>", "m", MergeNone, "aarch64_sve_fmmla_mf8f16", [IsOverloadNone]>;
+}
+
 def SVTRN1Q : SInst<"svtrn1q[_{d}]", "ddd", "csilUcUsUiUlhfdb", MergeNone, "aarch64_sve_trn1q">;
 def SVTRN2Q : SInst<"svtrn2q[_{d}]", "ddd", "csilUcUsUiUlhfdb", MergeNone, "aarch64_sve_trn2q">;
 def SVUZP1Q : SInst<"svuzp1q[_{d}]", "ddd", "csilUcUsUiUlhfdb", MergeNone, "aarch64_sve_uzp1q">;
diff --git a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_fmmla-f32f16.c b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_fmmla-f32f16.c
new file mode 100644
index 0000000000000..bebaa059e5c84
--- /dev/null
+++ b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_fmmla-f32f16.c
@@ -0,0 +1,33 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 6
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +sve-f16f32mm -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +sve-f16f32mm -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve-f16f32mm -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve-f16f32mm -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +sve-f16f32mm -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+
+// REQUIRES: aarch64-registered-target
+
+#include <arm_sve.h>
+
+#ifdef SVE_OVERLOADED_FORMS
+// A simple used,unused... macro, long enough to represent any SVE builtin.
+#define SVE_ACLE_FUNC(A1, A2_UNUSED, A3, A4_UNUSED) A1##A3
+#else
+#define SVE_ACLE_FUNC(A1, A2, A3, A4) A1##A2##A3##A4
+#endif
+
+// CHECK-LABEL: define dso_local <vscale x 4 x float> @test_f32f16(
+// CHECK-SAME: <vscale x 4 x float> [[ACC:%.*]], <vscale x 8 x half> [[A:%.*]], <vscale x 8 x half> [[B:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.fmmla.f16f32(<vscale x 4 x float> [[ACC]], <vscale x 8 x half> [[A]], <vscale x 8 x half> [[B]])
+// CHECK-NEXT:    ret <vscale x 4 x float> [[TMP0]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 4 x float> @_Z11test_f32f16u13__SVFloat32_tu13__SVFloat16_tS0_(
+// CPP-CHECK-SAME: <vscale x 4 x float> [[ACC:%.*]], <vscale x 8 x half> [[A:%.*]], <vscale x 8 x half> [[B:%.*]]) #[[ATTR0:[0-9]+]] {
+// CPP-CHECK-NEXT:  [[ENTRY:.*:]]
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.fmmla.f16f32(<vscale x 4 x float> [[ACC]], <vscale x 8 x half> [[A]], <vscale x 8 x half> [[B]])
+// CPP-CHECK-NEXT:    ret <vscale x 4 x float> [[TMP0]]
+//
+svfloat32_t test_f32f16(svfloat32_t acc, svfloat16_t a, svfloat16_t b) {
+  return SVE_ACLE_FUNC(svmmla, _f32_f16, , )(acc, a, b);
+}
diff --git a/clang/test/CodeGen/AArch64/sve2-intrinsics/acle_sve2_fmmla-f16mf8.c b/clang/test/CodeGen/AArch64/sve2-intrinsics/acle_sve2_fmmla-f16mf8.c
new file mode 100644
index 0000000000000..a19ad0576bb4b
--- /dev/null
+++ b/clang/test/CodeGen/AArch64/sve2-intrinsics/acle_sve2_fmmla-f16mf8.c
@@ -0,0 +1,35 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 6
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +f8f16mm -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +f8f16mm -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +f8f16mm -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +f8f16mm -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +f8f16mm -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+
+// REQUIRES: aarch64-registered-target
+
+#include <arm_sve.h>
+
+#ifdef SVE_OVERLOADED_FORMS
+// A simple used,unused... macro, long enough to represent any SVE builtin.
+#define SVE_ACLE_FUNC(A1, A2_UNUSED, A3, A4_UNUSED) A1##A3
+#else
+#define SVE_ACLE_FUNC(A1, A2, A3, A4) A1##A2##A3##A4
+#endif
+
+// CHECK-LABEL: define dso_local <vscale x 8 x half> @test_f16mf8(
+// CHECK-SAME: <vscale x 8 x half> [[ACC:%.*]], <vscale x 16 x i8> [[A:%.*]], <vscale x 16 x i8> [[B:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.fmmla.mf8f16(<vscale x 8 x half> [[ACC]], <vscale x 16 x i8> [[A]], <vscale x 16 x i8> [[B]])
+// CHECK-NEXT:    ret <vscale x 8 x half> [[TMP0]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 8 x half> @_Z11test_f16mf8u13__SVFloat16_tu13__SVMfloat8_tS0_m(
+// CPP-CHECK-SAME: <vscale x 8 x half> [[ACC:%.*]], <vscale x 16 x i8> [[A:%.*]], <vscale x 16 x i8> [[B:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0:[0-9]+]] {
+// CPP-CHECK-NEXT:  [[ENTRY:.*:]]
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.fmmla.mf8f16(<vscale x 8 x half> [[ACC]], <vscale x 16 x i8> [[A]], <vscale x 16 x i8> [[B]])
+// CPP-CHECK-NEXT:    ret <vscale x 8 x half> [[TMP0]]
+//
+svfloat16_t test_f16mf8(svfloat16_t acc, svmfloat8_t a, svmfloat8_t b, fpm_t fpmr) {
+  return SVE_ACLE_FUNC(svmmla, _f16_mf8, _fpm, )(acc, a, b, fpmr);
+}
diff --git a/clang/test/CodeGen/AArch64/sve2-intrinsics/acle_sve2_fmmla-f32mf8.c b/clang/test/CodeGen/AArch64/sve2-intrinsics/acle_sve2_fmmla-f32mf8.c
new file mode 100644
index 0000000000000..526f2b1f45927
--- /dev/null
+++ b/clang/test/CodeGen/AArch64/sve2-intrinsics/acle_sve2_fmmla-f32mf8.c
@@ -0,0 +1,36 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 6
+
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +f8f32mm -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +f8f32mm -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +f8f32mm -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +f8f32mm -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +f8f32mm -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+
+// REQUIRES: aarch64-registered-target
+
+#include <arm_sve.h>
+
+#ifdef SVE_OVERLOADED_FORMS
+// A simple used,unused... macro, long enough to represent any SVE builtin.
+#define SVE_ACLE_FUNC(A1, A2_UNUSED, A3, A4_UNUSED) A1##A3
+#else
+#define SVE_ACLE_FUNC(A1, A2, A3, A4) A1##A2##A3##A4
+#endif
+
+// CHECK-LABEL: define dso_local <vscale x 4 x float> @test_f32mf8(
+// CHECK-SAME: <vscale x 4 x float> [[ACC:%.*]], <vscale x 16 x i8> [[A:%.*]], <vscale x 16 x i8> [[B:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.fmmla.mf8f32(<vscale x 4 x float> [[ACC]], <vscale x 16 x i8> [[A]], <vscale x 16 x i8> [[B]])
+// CHECK-NEXT:    ret <vscale x 4 x float> [[TMP0]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 4 x float> @_Z11test_f32mf8u13__SVFloat32_tu13__SVMfloat8_tS0_m(
+// CPP-CHECK-SAME: <vscale x 4 x float> [[ACC:%.*]], <vscale x 16 x i8> [[A:%.*]], <vscale x 16 x i8> [[B:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0:[0-9]+]] {
+// CPP-CHECK-NEXT:  [[ENTRY:.*:]]
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.fmmla.mf8f32(<vscale x 4 x float> [[ACC]], <vscale x 16 x i8> [[A]], <vscale x 16 x i8> [[B]])
+// CPP-CHECK-NEXT:    ret <vscale x 4 x float> [[TMP0]]
+//
+svfloat32_t test_f32mf8(svfloat32_t acc, svmfloat8_t a, svmfloat8_t b, fpm_t fpmr) {
+  return SVE_ACLE_FUNC(svmmla, _f32_mf8, _fpm, )(acc, a, b, fpmr);
+}
diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
index b81edc385cd43..832f97fc95959 100644
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -2807,6 +2807,20 @@ def int_aarch64_sve_sudot_lane : AdvSIMD_SVE_DOT_Indexed_Intrinsic;
 //
 def int_aarch64_sve_fmmla : AdvSIMD_3VectorArg_Intrinsic;
 
+def int_aarch64_sve_fmmla_f16f32
+    : DefaultAttrsIntrinsic<[llvm_nxv4f32_ty],
+                          [ llvm_nxv4f32_ty, llvm_nxv8f16_ty, llvm_nxv8f16_ty ],
+                          [IntrNoMem]>;
+
+def int_aarch64_sve_fmmla_mf8f32
+  : DefaultAttrsIntrinsic<[llvm_nxv4f32_ty],
+                          [ llvm_nxv4f32_ty, llvm_nxv16i8_ty, llvm_nxv16i8_ty ],
+                          [IntrNoMem]>;
+
+def int_aarch64_sve_fmmla_mf8f16
+  : DefaultAttrsIntrinsic<[llvm_nxv8f16_ty],
+                          [ llvm_nxv8f16_ty, llvm_nxv16i8_ty, llvm_nxv16i8_ty ],
+                          [IntrNoMem]>;                     
 //
 // SVE ACLE: 7.2. BFloat16 extensions
 //
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index 3b268dcbca600..c756873d0bf7e 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -3684,7 +3684,7 @@ let Predicates = [HasSVE, HasMatMulFP32] in {
 } // End HasSVE, HasMatMulFP32
 
 let Predicates = [HasSVE_F16F32MM] in {
-  def FMLLA_ZZZ_HtoS : sve_fp_matrix_mla<0b001, "fmmla", ZPR32, ZPR16>;
+  defm FMLLA_ZZZ_HtoS : sve_fp_matrix_mla<0b001, "fmmla", ZPR32, ZPR16, int_aarch64_sve_fmmla_f16f32, nxv4f32, nxv8f16>;
 } // End HasSVE_F16F32MM
 
 let Predicates = [HasSVE, HasMatMulFP64] in {
@@ -4745,10 +4745,14 @@ defm FMLALLTT_ZZZ : sve2_fp8_mla<0b011, ZPR32, "fmlalltt", nxv4f32, int_aarch64_
 
 let Predicates = [HasSVE2, HasF8F32MM] in {
   def FMMLA_ZZZ_BtoS :  sve2_fp8_mmla<0b0, ZPR32, "fmmla">;
+  def : Pat<(nxv4f32 (int_aarch64_sve_fmmla_mf8f32 nxv4f32:$acc, nxv16i8:$zn, nxv16i8:$zm)), 
+        (FMMLA_ZZZ_BtoS $acc, $zn, $zm)>;
 }
 
 let Predicates = [HasSVE2, HasF8F16MM] in {
   def FMMLA_ZZZ_BtoH :  sve2_fp8_mmla<0b1, ZPR16, "fmmla">;
+  def : Pat<(nxv8f16 (int_aarch64_sve_fmmla_mf8f16 nxv8f16:$acc, nxv16i8:$zn, nxv16i8:$zm)),
+        (FMMLA_ZZZ_BtoH $acc, $zn, $zm)>;
 }
 
 let Predicates = [HasSSVE_FP8DOT2] in {
diff --git a/llvm/test/CodeGen/AArch64/sve-fmmla-f32f16.ll b/llvm/test/CodeGen/AArch64/sve-fmmla-f32f16.ll
new file mode 100644
index 0000000000000..ea636d65a479c
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-fmmla-f32f16.ll
@@ -0,0 +1,32 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve-f16f32mm < %s | FileCheck %s --check-prefixes=CHECK
+
+define <vscale x 4 x float> @_Z1tu13__SVFloat32_tu13__SVFloat16_tS0_(<vscale x 4 x float> %acc, <vscale x 8 x half> %a, <vscale x 8 x half> %b) {
+; CHECK-LABEL: _Z1tu13__SVFloat32_tu13__SVFloat16_tS0_:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-3
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22 // sp + 16 + 24 * VG
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    str z0, [sp, #2, mul vl]
+; CHECK-NEXT:    fmmla z0.s, z1.h, z2.h
+; CHECK-NEXT:    str z1, [sp, #1, mul vl]
+; CHECK-NEXT:    str z2, [sp]
+; CHECK-NEXT:    addvl sp, sp, #3
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+entry:
+  %acc.addr = alloca <vscale x 4 x float>, align 16
+  %a.addr = alloca <vscale x 8 x half>, align 16
+  %b.addr = alloca <vscale x 8 x half>, align 16
+  store <vscale x 4 x float> %acc, ptr %acc.addr, align 16
+  store <vscale x 8 x half> %a, ptr %a.addr, align 16
+  store <vscale x 8 x half> %b, ptr %b.addr, align 16
+  %0 = load <vscale x 4 x float>, ptr %acc.addr, align 16
+  %1 = load <vscale x 8 x half>, ptr %a.addr, align 16
+  %2 = load <vscale x 8 x half>, ptr %b.addr, align 16
+  %3 = call <vscale x 4 x float> @llvm.aarch64.sve.fmmla.f16f32(<vscale x 4 x float> %0, <vscale x 8 x half> %1, <vscale x 8 x half> %2)
+  ret <vscale x 4 x float> %3
+}
+
+declare <vscale x 4 x float> @llvm.aarch64.sve.fmmla.f16f32(<vscale x 4 x float>, <vscale x 8 x half>, <vscale x 8 x half>)
diff --git a/llvm/test/CodeGen/AArch64/sve2-fmmla-f16mf8.ll b/llvm/test/CodeGen/AArch64/sve2-fmmla-f16mf8.ll
new file mode 100644
index 0000000000000..0fdd6bf2508e3
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve2-fmmla-f16mf8.ll
@@ -0,0 +1,39 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve,+sve2,+f8f16mm < %s | FileCheck %s --check-prefixes=CHECK
+
+define <vscale x 8 x half> @_Z5test2u13__SVFloat16_tu13__SVMfloat8_tS0_m(<vscale x 8 x half> %acc, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b, i64 %fpmr) {
+; CHECK-LABEL: _Z5test2u13__SVFloat16_tu13__SVMfloat8_tS0_m:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-3
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22 // sp + 16 + 24 * VG
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    addvl x8, sp, #3
+; CHECK-NEXT:    str z1, [sp, #1, mul vl]
+; CHECK-NEXT:    str z0, [sp, #2, mul vl]
+; CHECK-NEXT:    str z2, [sp]
+; CHECK-NEXT:    str x0, [x8, #8]
+; CHECK-NEXT:    msr FPMR, x0
+; CHECK-NEXT:    fmmla z0.h, z1.b, z2.b
+; CHECK-NEXT:    addvl sp, sp, #3
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+entry:
+  %acc.addr = alloca <vscale x 8 x half>, align 16
+  %a.addr = alloca <vscale x 16 x i8>, align 16
+  %b.addr = alloca <vscale x 16 x i8>, align 16
+  %fpmr.addr = alloca i64, align 8
+  store <vscale x 8 x half> %acc, ptr %acc.addr, align 16
+  store <vscale x 16 x i8> %a, ptr %a.addr, align 16
+  store <vscale x 16 x i8> %b, ptr %b.addr, align 16
+  store i64 %fpmr, ptr %fpmr.addr, align 8
+  %0 = load <vscale x 8 x half>, ptr %acc.addr, align 16
+  %1 = load <vscale x 16 x i8>, ptr %a.addr, align 16
+  %2 = load <vscale x 16 x i8>, ptr %b.addr, align 16
+  %3 = load i64, ptr %fpmr.addr, align 8
+  call void @llvm.aarch64.set.fpmr(i64 %3)
+  %4 = call <vscale x 8 x half> @llvm.aarch64.sve.fmmla.mf8f16(<vscale x 8 x half> %0, <vscale x 16 x i8> %1, <vscale x 16 x i8> %2)
+  ret <vscale x 8 x half> %4
+}
+
+declare <vscale x 8 x half> @llvm.aarch64.sve.fmmla.mf8f16(<vscale x 8 x half>, <vscale x 16 x i8>, <vscale x 16 x i8>)
diff --git a/llvm/test/CodeGen/AArch64/sve2-fmmla-f32mf8.ll b/llvm/test/CodeGen/AArch64/sve2-fmmla-f32mf8.ll
new file mode 100644
index 0000000000000..007a164ac77da
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve2-fmmla-f32mf8.ll
@@ -0,0 +1,41 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve,+sve2,+f8f32mm < %s | FileCheck %s --check-prefixes=CHECK
+
+define dso_local <vscale x 4 x float> @_Z5t_varu13__SVFloat32_tu13__SVMfloat8_tS0_m(<vscale x 4 x float> %acc, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b, i64 noundef %fpmr) #0 {
+; CHECK-LABEL: _Z5t_varu13__SVFloat32_tu13__SVMfloat8_tS0_m:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-3
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22 // sp + 16 + 24 * VG
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    addvl x8, sp, #3
+; CHECK-NEXT:    str z1, [sp, #1, mul vl]
+; CHECK-NEXT:    str z0, [sp, #2, mul vl]
+; CHECK-NEXT:    str z2, [sp]
+; CHECK-NEXT:    str x0, [x8, #8]
+; CHECK-NEXT:    msr FPMR, x0
+; CHECK-NEXT:    fmmla z0.s, z1.b, z2.b
+; CHECK-NEXT:    addvl sp, sp, #3
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+entry:
+  %acc.addr = alloca <vscale x 4 x float>, align 16
+  %a.addr = alloca <vscale x 16 x i8>, align 16
+  %b.addr = alloca <vscale x 16 x i8>, align 16
+  %fpmr.addr = alloca i64, align 8
+  store <vscale x 4 x float> %acc, ptr %acc.addr, align 16
+  store <vscale x 16 x i8> %a, ptr %a.addr, align 16
+  store <vscale x 16 x i8> %b, ptr %b.addr, align 16
+  store i64 %fpmr, ptr %fpmr.addr, align 8
+  %0 = load <vscale x 4 x float>, ptr %acc.addr, align 16
+  %1 = load <vscale x 16 x i8>, ptr %a.addr, align 16
+  %2 = load <vscale x 16 x i8>, ptr %b.addr, align 16
+  %3 = load i64, ptr %fpmr.addr, align 8
+  call void @llvm.aarch64.set.fpmr(i64 %3)
+  %4 = call <vscale x 4 x float> @llvm.aarch64.sve.fmmla.mf8f32(<vscale x 4 x float> %0, <vscale x 16 x i8> %1, <vscale x 16 x i8> %2)
+  ret <vscale x 4 x float> %4
+}
+
+declare void @llvm.aarch64.set.fpmr(i64)
+
+declare <vscale x 4 x float> @llvm.aarch64.sve.fmmla.mf8f32(<vscale x 4 x float>, <vscale x 16 x i8>, <vscale x 16 x i8>)

llvmbot · 2025-10-27T17:20:29Z

@llvm/pr-subscribers-llvm-ir

Author: Amina Chabane (Amichaxx)

Changes

Proposed in this ACLE proposal, this PR implements widening FMMLA intrinsics.

F16 to F32
MF8 to F32
MF8 to F16

Full diff: https://github.com/llvm/llvm-project/pull/165282.diff

9 Files Affected:

(modified) clang/include/clang/Basic/arm_sve.td (+12)
(added) clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_fmmla-f32f16.c (+33)
(added) clang/test/CodeGen/AArch64/sve2-intrinsics/acle_sve2_fmmla-f16mf8.c (+35)
(added) clang/test/CodeGen/AArch64/sve2-intrinsics/acle_sve2_fmmla-f32mf8.c (+36)
(modified) llvm/include/llvm/IR/IntrinsicsAArch64.td (+14)
(modified) llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td (+5-1)
(added) llvm/test/CodeGen/AArch64/sve-fmmla-f32f16.ll (+32)
(added) llvm/test/CodeGen/AArch64/sve2-fmmla-f16mf8.ll (+39)
(added) llvm/test/CodeGen/AArch64/sve2-fmmla-f32mf8.ll (+41)

diff --git a/clang/include/clang/Basic/arm_sve.td b/clang/include/clang/Basic/arm_sve.td
index d2b7b78b9970f..c63da3308d6a0 100644
--- a/clang/include/clang/Basic/arm_sve.td
+++ b/clang/include/clang/Basic/arm_sve.td
@@ -1196,6 +1196,18 @@ def SVMLLA_F32 : SInst<"svmmla[_f32]", "dddd","f", MergeNone, "aarch64_sve_fmmla
 let SVETargetGuard = "f64mm", SMETargetGuard = InvalidMode in {
 def SVMLLA_F64 : SInst<"svmmla[_f64]",  "dddd", "d", MergeNone, "aarch64_sve_fmmla">;
 
+let SVETargetGuard = "sve-f16f32mm", SMETargetGuard = InvalidMode in {
+  def SVMLLA_F32_F16  : SInst<"svmmla[_f32_f16]", "MMdd", "h", MergeNone, "aarch64_sve_fmmla_f16f32", [IsOverloadNone]>;
+}
+
+let SVETargetGuard = "sve2,f8f32mm", SMETargetGuard = InvalidMode in {
+  def SVMLLA_F32_MF8 : SInst<"svmmla[_f32_mf8]", "MM~~>", "m", MergeNone, "aarch64_sve_fmmla_mf8f32", [IsOverloadNone]>;
+}
+
+let SVETargetGuard = "sve2,f8f16mm", SMETargetGuard = InvalidMode in {
+  def SVMLLA_F16_MF8 : SInst<"svmmla[_f16_mf8]", "OO~~>", "m", MergeNone, "aarch64_sve_fmmla_mf8f16", [IsOverloadNone]>;
+}
+
 def SVTRN1Q : SInst<"svtrn1q[_{d}]", "ddd", "csilUcUsUiUlhfdb", MergeNone, "aarch64_sve_trn1q">;
 def SVTRN2Q : SInst<"svtrn2q[_{d}]", "ddd", "csilUcUsUiUlhfdb", MergeNone, "aarch64_sve_trn2q">;
 def SVUZP1Q : SInst<"svuzp1q[_{d}]", "ddd", "csilUcUsUiUlhfdb", MergeNone, "aarch64_sve_uzp1q">;
diff --git a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_fmmla-f32f16.c b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_fmmla-f32f16.c
new file mode 100644
index 0000000000000..bebaa059e5c84
--- /dev/null
+++ b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_fmmla-f32f16.c
@@ -0,0 +1,33 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 6
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +sve-f16f32mm -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +sve-f16f32mm -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve-f16f32mm -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve-f16f32mm -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +sve-f16f32mm -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+
+// REQUIRES: aarch64-registered-target
+
+#include <arm_sve.h>
+
+#ifdef SVE_OVERLOADED_FORMS
+// A simple used,unused... macro, long enough to represent any SVE builtin.
+#define SVE_ACLE_FUNC(A1, A2_UNUSED, A3, A4_UNUSED) A1##A3
+#else
+#define SVE_ACLE_FUNC(A1, A2, A3, A4) A1##A2##A3##A4
+#endif
+
+// CHECK-LABEL: define dso_local <vscale x 4 x float> @test_f32f16(
+// CHECK-SAME: <vscale x 4 x float> [[ACC:%.*]], <vscale x 8 x half> [[A:%.*]], <vscale x 8 x half> [[B:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.fmmla.f16f32(<vscale x 4 x float> [[ACC]], <vscale x 8 x half> [[A]], <vscale x 8 x half> [[B]])
+// CHECK-NEXT:    ret <vscale x 4 x float> [[TMP0]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 4 x float> @_Z11test_f32f16u13__SVFloat32_tu13__SVFloat16_tS0_(
+// CPP-CHECK-SAME: <vscale x 4 x float> [[ACC:%.*]], <vscale x 8 x half> [[A:%.*]], <vscale x 8 x half> [[B:%.*]]) #[[ATTR0:[0-9]+]] {
+// CPP-CHECK-NEXT:  [[ENTRY:.*:]]
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.fmmla.f16f32(<vscale x 4 x float> [[ACC]], <vscale x 8 x half> [[A]], <vscale x 8 x half> [[B]])
+// CPP-CHECK-NEXT:    ret <vscale x 4 x float> [[TMP0]]
+//
+svfloat32_t test_f32f16(svfloat32_t acc, svfloat16_t a, svfloat16_t b) {
+  return SVE_ACLE_FUNC(svmmla, _f32_f16, , )(acc, a, b);
+}
diff --git a/clang/test/CodeGen/AArch64/sve2-intrinsics/acle_sve2_fmmla-f16mf8.c b/clang/test/CodeGen/AArch64/sve2-intrinsics/acle_sve2_fmmla-f16mf8.c
new file mode 100644
index 0000000000000..a19ad0576bb4b
--- /dev/null
+++ b/clang/test/CodeGen/AArch64/sve2-intrinsics/acle_sve2_fmmla-f16mf8.c
@@ -0,0 +1,35 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 6
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +f8f16mm -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +f8f16mm -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +f8f16mm -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +f8f16mm -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +f8f16mm -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+
+// REQUIRES: aarch64-registered-target
+
+#include <arm_sve.h>
+
+#ifdef SVE_OVERLOADED_FORMS
+// A simple used,unused... macro, long enough to represent any SVE builtin.
+#define SVE_ACLE_FUNC(A1, A2_UNUSED, A3, A4_UNUSED) A1##A3
+#else
+#define SVE_ACLE_FUNC(A1, A2, A3, A4) A1##A2##A3##A4
+#endif
+
+// CHECK-LABEL: define dso_local <vscale x 8 x half> @test_f16mf8(
+// CHECK-SAME: <vscale x 8 x half> [[ACC:%.*]], <vscale x 16 x i8> [[A:%.*]], <vscale x 16 x i8> [[B:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.fmmla.mf8f16(<vscale x 8 x half> [[ACC]], <vscale x 16 x i8> [[A]], <vscale x 16 x i8> [[B]])
+// CHECK-NEXT:    ret <vscale x 8 x half> [[TMP0]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 8 x half> @_Z11test_f16mf8u13__SVFloat16_tu13__SVMfloat8_tS0_m(
+// CPP-CHECK-SAME: <vscale x 8 x half> [[ACC:%.*]], <vscale x 16 x i8> [[A:%.*]], <vscale x 16 x i8> [[B:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0:[0-9]+]] {
+// CPP-CHECK-NEXT:  [[ENTRY:.*:]]
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.fmmla.mf8f16(<vscale x 8 x half> [[ACC]], <vscale x 16 x i8> [[A]], <vscale x 16 x i8> [[B]])
+// CPP-CHECK-NEXT:    ret <vscale x 8 x half> [[TMP0]]
+//
+svfloat16_t test_f16mf8(svfloat16_t acc, svmfloat8_t a, svmfloat8_t b, fpm_t fpmr) {
+  return SVE_ACLE_FUNC(svmmla, _f16_mf8, _fpm, )(acc, a, b, fpmr);
+}
diff --git a/clang/test/CodeGen/AArch64/sve2-intrinsics/acle_sve2_fmmla-f32mf8.c b/clang/test/CodeGen/AArch64/sve2-intrinsics/acle_sve2_fmmla-f32mf8.c
new file mode 100644
index 0000000000000..526f2b1f45927
--- /dev/null
+++ b/clang/test/CodeGen/AArch64/sve2-intrinsics/acle_sve2_fmmla-f32mf8.c
@@ -0,0 +1,36 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 6
+
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +f8f32mm -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +f8f32mm -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +f8f32mm -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +f8f32mm -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +f8f32mm -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+
+// REQUIRES: aarch64-registered-target
+
+#include <arm_sve.h>
+
+#ifdef SVE_OVERLOADED_FORMS
+// A simple used,unused... macro, long enough to represent any SVE builtin.
+#define SVE_ACLE_FUNC(A1, A2_UNUSED, A3, A4_UNUSED) A1##A3
+#else
+#define SVE_ACLE_FUNC(A1, A2, A3, A4) A1##A2##A3##A4
+#endif
+
+// CHECK-LABEL: define dso_local <vscale x 4 x float> @test_f32mf8(
+// CHECK-SAME: <vscale x 4 x float> [[ACC:%.*]], <vscale x 16 x i8> [[A:%.*]], <vscale x 16 x i8> [[B:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.fmmla.mf8f32(<vscale x 4 x float> [[ACC]], <vscale x 16 x i8> [[A]], <vscale x 16 x i8> [[B]])
+// CHECK-NEXT:    ret <vscale x 4 x float> [[TMP0]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 4 x float> @_Z11test_f32mf8u13__SVFloat32_tu13__SVMfloat8_tS0_m(
+// CPP-CHECK-SAME: <vscale x 4 x float> [[ACC:%.*]], <vscale x 16 x i8> [[A:%.*]], <vscale x 16 x i8> [[B:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0:[0-9]+]] {
+// CPP-CHECK-NEXT:  [[ENTRY:.*:]]
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.fmmla.mf8f32(<vscale x 4 x float> [[ACC]], <vscale x 16 x i8> [[A]], <vscale x 16 x i8> [[B]])
+// CPP-CHECK-NEXT:    ret <vscale x 4 x float> [[TMP0]]
+//
+svfloat32_t test_f32mf8(svfloat32_t acc, svmfloat8_t a, svmfloat8_t b, fpm_t fpmr) {
+  return SVE_ACLE_FUNC(svmmla, _f32_mf8, _fpm, )(acc, a, b, fpmr);
+}
diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
index b81edc385cd43..832f97fc95959 100644
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -2807,6 +2807,20 @@ def int_aarch64_sve_sudot_lane : AdvSIMD_SVE_DOT_Indexed_Intrinsic;
 //
 def int_aarch64_sve_fmmla : AdvSIMD_3VectorArg_Intrinsic;
 
+def int_aarch64_sve_fmmla_f16f32
+    : DefaultAttrsIntrinsic<[llvm_nxv4f32_ty],
+                          [ llvm_nxv4f32_ty, llvm_nxv8f16_ty, llvm_nxv8f16_ty ],
+                          [IntrNoMem]>;
+
+def int_aarch64_sve_fmmla_mf8f32
+  : DefaultAttrsIntrinsic<[llvm_nxv4f32_ty],
+                          [ llvm_nxv4f32_ty, llvm_nxv16i8_ty, llvm_nxv16i8_ty ],
+                          [IntrNoMem]>;
+
+def int_aarch64_sve_fmmla_mf8f16
+  : DefaultAttrsIntrinsic<[llvm_nxv8f16_ty],
+                          [ llvm_nxv8f16_ty, llvm_nxv16i8_ty, llvm_nxv16i8_ty ],
+                          [IntrNoMem]>;                     
 //
 // SVE ACLE: 7.2. BFloat16 extensions
 //
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index 3b268dcbca600..c756873d0bf7e 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -3684,7 +3684,7 @@ let Predicates = [HasSVE, HasMatMulFP32] in {
 } // End HasSVE, HasMatMulFP32
 
 let Predicates = [HasSVE_F16F32MM] in {
-  def FMLLA_ZZZ_HtoS : sve_fp_matrix_mla<0b001, "fmmla", ZPR32, ZPR16>;
+  defm FMLLA_ZZZ_HtoS : sve_fp_matrix_mla<0b001, "fmmla", ZPR32, ZPR16, int_aarch64_sve_fmmla_f16f32, nxv4f32, nxv8f16>;
 } // End HasSVE_F16F32MM
 
 let Predicates = [HasSVE, HasMatMulFP64] in {
@@ -4745,10 +4745,14 @@ defm FMLALLTT_ZZZ : sve2_fp8_mla<0b011, ZPR32, "fmlalltt", nxv4f32, int_aarch64_
 
 let Predicates = [HasSVE2, HasF8F32MM] in {
   def FMMLA_ZZZ_BtoS :  sve2_fp8_mmla<0b0, ZPR32, "fmmla">;
+  def : Pat<(nxv4f32 (int_aarch64_sve_fmmla_mf8f32 nxv4f32:$acc, nxv16i8:$zn, nxv16i8:$zm)), 
+        (FMMLA_ZZZ_BtoS $acc, $zn, $zm)>;
 }
 
 let Predicates = [HasSVE2, HasF8F16MM] in {
   def FMMLA_ZZZ_BtoH :  sve2_fp8_mmla<0b1, ZPR16, "fmmla">;
+  def : Pat<(nxv8f16 (int_aarch64_sve_fmmla_mf8f16 nxv8f16:$acc, nxv16i8:$zn, nxv16i8:$zm)),
+        (FMMLA_ZZZ_BtoH $acc, $zn, $zm)>;
 }
 
 let Predicates = [HasSSVE_FP8DOT2] in {
diff --git a/llvm/test/CodeGen/AArch64/sve-fmmla-f32f16.ll b/llvm/test/CodeGen/AArch64/sve-fmmla-f32f16.ll
new file mode 100644
index 0000000000000..ea636d65a479c
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-fmmla-f32f16.ll
@@ -0,0 +1,32 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve-f16f32mm < %s | FileCheck %s --check-prefixes=CHECK
+
+define <vscale x 4 x float> @_Z1tu13__SVFloat32_tu13__SVFloat16_tS0_(<vscale x 4 x float> %acc, <vscale x 8 x half> %a, <vscale x 8 x half> %b) {
+; CHECK-LABEL: _Z1tu13__SVFloat32_tu13__SVFloat16_tS0_:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-3
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22 // sp + 16 + 24 * VG
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    str z0, [sp, #2, mul vl]
+; CHECK-NEXT:    fmmla z0.s, z1.h, z2.h
+; CHECK-NEXT:    str z1, [sp, #1, mul vl]
+; CHECK-NEXT:    str z2, [sp]
+; CHECK-NEXT:    addvl sp, sp, #3
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+entry:
+  %acc.addr = alloca <vscale x 4 x float>, align 16
+  %a.addr = alloca <vscale x 8 x half>, align 16
+  %b.addr = alloca <vscale x 8 x half>, align 16
+  store <vscale x 4 x float> %acc, ptr %acc.addr, align 16
+  store <vscale x 8 x half> %a, ptr %a.addr, align 16
+  store <vscale x 8 x half> %b, ptr %b.addr, align 16
+  %0 = load <vscale x 4 x float>, ptr %acc.addr, align 16
+  %1 = load <vscale x 8 x half>, ptr %a.addr, align 16
+  %2 = load <vscale x 8 x half>, ptr %b.addr, align 16
+  %3 = call <vscale x 4 x float> @llvm.aarch64.sve.fmmla.f16f32(<vscale x 4 x float> %0, <vscale x 8 x half> %1, <vscale x 8 x half> %2)
+  ret <vscale x 4 x float> %3
+}
+
+declare <vscale x 4 x float> @llvm.aarch64.sve.fmmla.f16f32(<vscale x 4 x float>, <vscale x 8 x half>, <vscale x 8 x half>)
diff --git a/llvm/test/CodeGen/AArch64/sve2-fmmla-f16mf8.ll b/llvm/test/CodeGen/AArch64/sve2-fmmla-f16mf8.ll
new file mode 100644
index 0000000000000..0fdd6bf2508e3
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve2-fmmla-f16mf8.ll
@@ -0,0 +1,39 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve,+sve2,+f8f16mm < %s | FileCheck %s --check-prefixes=CHECK
+
+define <vscale x 8 x half> @_Z5test2u13__SVFloat16_tu13__SVMfloat8_tS0_m(<vscale x 8 x half> %acc, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b, i64 %fpmr) {
+; CHECK-LABEL: _Z5test2u13__SVFloat16_tu13__SVMfloat8_tS0_m:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-3
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22 // sp + 16 + 24 * VG
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    addvl x8, sp, #3
+; CHECK-NEXT:    str z1, [sp, #1, mul vl]
+; CHECK-NEXT:    str z0, [sp, #2, mul vl]
+; CHECK-NEXT:    str z2, [sp]
+; CHECK-NEXT:    str x0, [x8, #8]
+; CHECK-NEXT:    msr FPMR, x0
+; CHECK-NEXT:    fmmla z0.h, z1.b, z2.b
+; CHECK-NEXT:    addvl sp, sp, #3
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+entry:
+  %acc.addr = alloca <vscale x 8 x half>, align 16
+  %a.addr = alloca <vscale x 16 x i8>, align 16
+  %b.addr = alloca <vscale x 16 x i8>, align 16
+  %fpmr.addr = alloca i64, align 8
+  store <vscale x 8 x half> %acc, ptr %acc.addr, align 16
+  store <vscale x 16 x i8> %a, ptr %a.addr, align 16
+  store <vscale x 16 x i8> %b, ptr %b.addr, align 16
+  store i64 %fpmr, ptr %fpmr.addr, align 8
+  %0 = load <vscale x 8 x half>, ptr %acc.addr, align 16
+  %1 = load <vscale x 16 x i8>, ptr %a.addr, align 16
+  %2 = load <vscale x 16 x i8>, ptr %b.addr, align 16
+  %3 = load i64, ptr %fpmr.addr, align 8
+  call void @llvm.aarch64.set.fpmr(i64 %3)
+  %4 = call <vscale x 8 x half> @llvm.aarch64.sve.fmmla.mf8f16(<vscale x 8 x half> %0, <vscale x 16 x i8> %1, <vscale x 16 x i8> %2)
+  ret <vscale x 8 x half> %4
+}
+
+declare <vscale x 8 x half> @llvm.aarch64.sve.fmmla.mf8f16(<vscale x 8 x half>, <vscale x 16 x i8>, <vscale x 16 x i8>)
diff --git a/llvm/test/CodeGen/AArch64/sve2-fmmla-f32mf8.ll b/llvm/test/CodeGen/AArch64/sve2-fmmla-f32mf8.ll
new file mode 100644
index 0000000000000..007a164ac77da
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve2-fmmla-f32mf8.ll
@@ -0,0 +1,41 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve,+sve2,+f8f32mm < %s | FileCheck %s --check-prefixes=CHECK
+
+define dso_local <vscale x 4 x float> @_Z5t_varu13__SVFloat32_tu13__SVMfloat8_tS0_m(<vscale x 4 x float> %acc, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b, i64 noundef %fpmr) #0 {
+; CHECK-LABEL: _Z5t_varu13__SVFloat32_tu13__SVMfloat8_tS0_m:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-3
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22 // sp + 16 + 24 * VG
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    addvl x8, sp, #3
+; CHECK-NEXT:    str z1, [sp, #1, mul vl]
+; CHECK-NEXT:    str z0, [sp, #2, mul vl]
+; CHECK-NEXT:    str z2, [sp]
+; CHECK-NEXT:    str x0, [x8, #8]
+; CHECK-NEXT:    msr FPMR, x0
+; CHECK-NEXT:    fmmla z0.s, z1.b, z2.b
+; CHECK-NEXT:    addvl sp, sp, #3
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+entry:
+  %acc.addr = alloca <vscale x 4 x float>, align 16
+  %a.addr = alloca <vscale x 16 x i8>, align 16
+  %b.addr = alloca <vscale x 16 x i8>, align 16
+  %fpmr.addr = alloca i64, align 8
+  store <vscale x 4 x float> %acc, ptr %acc.addr, align 16
+  store <vscale x 16 x i8> %a, ptr %a.addr, align 16
+  store <vscale x 16 x i8> %b, ptr %b.addr, align 16
+  store i64 %fpmr, ptr %fpmr.addr, align 8
+  %0 = load <vscale x 4 x float>, ptr %acc.addr, align 16
+  %1 = load <vscale x 16 x i8>, ptr %a.addr, align 16
+  %2 = load <vscale x 16 x i8>, ptr %b.addr, align 16
+  %3 = load i64, ptr %fpmr.addr, align 8
+  call void @llvm.aarch64.set.fpmr(i64 %3)
+  %4 = call <vscale x 4 x float> @llvm.aarch64.sve.fmmla.mf8f32(<vscale x 4 x float> %0, <vscale x 16 x i8> %1, <vscale x 16 x i8> %2)
+  ret <vscale x 4 x float> %4
+}
+
+declare void @llvm.aarch64.set.fpmr(i64)
+
+declare <vscale x 4 x float> @llvm.aarch64.sve.fmmla.mf8f32(<vscale x 4 x float>, <vscale x 16 x i8>, <vscale x 16 x i8>)

clang/include/clang/Basic/arm_sve.td

llvm/include/llvm/IR/IntrinsicsAArch64.td

llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td

llvm/test/CodeGen/AArch64/sve-fmmla-f32f16.ll

clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_fmmla-f32f16.c

…ntrinsics - Prototype cleanups - Updated ll tests to remove unnecessary IR - Removed unused arguments in clang test macros - Removed redundant check lines in ll tests

llvm/include/llvm/IR/IntrinsicsAArch64.td

…ingly Formatting

clang/include/clang/Basic/arm_sve.td

llvm/include/llvm/IR/IntrinsicsAArch64.td

Renamed IsOverloadCvt flag to IsOverloadFirstandLast for clarity

github-actions · 2025-11-17T16:06:07Z

✅ With the latest revision this PR passed the C/C++ code formatter.

efriedma-quic · 2025-11-17T20:10:00Z

Has there been any further discussion about the semantics of llvm.aarch64.set.fpmr? It looks like there was some discussion in #114248, but it wasn't ever really resolved. It seems problematic to allow users to explicitly control the state at the IR level, as opposed to letting the compiler configure it in the backend.

Lukacma · 2025-11-18T15:17:54Z

So the discussion you linked, seems to be about whether to add data dependency between set.fpmr and its use and whether the preserve memory side effect of it. I think in the current design we decided use memory effects to express the dependency, but @CarolineConcatto is implementing it, so she might be able to explain better why we went in that direction.

It seems problematic to allow users to explicitly control the state at the IR level, as opposed to letting the compiler configure it in the backend.

I don't think I quite follow what you mean here ? I might be completely wrong here, but there is no way for compiler to know how to setup fields in that register, so we need to rely on user to do it himself corretly ?

efriedma-quic · 2025-11-18T22:48:34Z

See https://reviews.llvm.org/D113439 for similar discussion we had for RISC-V intrinsics. Basically the same issues apply here. The end result was that the intrinsics were reworked into their current form: the intrinsics take the mode as an argument, and an MIR pass converts the mode operand into explicit instructions to set the mode.

CarolineConcatto · 2025-11-19T17:48:59Z

Hi @efriedma-quic,
So we are working towards the memory dependency.
I wrote and RFC:
https://discourse.llvm.org/t/rfc-improving-fpmr-handling-for-fp8-intrinsics-in-llvm/86868/6
to check how to improve the handling of set.fprm.
The suggestion was to do it through memory location in llvm(the store solution was rejected and the data dependency is not enough to block wrong optimisations)
The fprm register will map to one of the target locations and with that we can use some optimisations and also guarantee the order of the calls.
Here is the first patch:
#148650
I am working now towards improving alias analysis so it better disambiguates between target memory locations.
When this is done, and at some point soon I will have to change the llvm-ir of the instructions that use fprm to use the new memory model.
Does this help to answer your question?

CarolineConcatto · 2025-11-19T18:09:05Z

Another thin @efriedma-quic
I saw the comments about creating an intrinsic for each rouding mode, but we believe that is not enough for the FP8 instruction. Because it is not setting only the routing mode that FPMR set, but also scales, overflow and saturations. We've consider that solution as well, but I think that main reason was that this would be more complex for the user than just having one intrinsic for each instruction and setting fpmr for what was desired

github-actions · 2025-11-19T18:59:33Z

🐧 Linux x64 Test Results

192860 tests passed
6180 tests skipped

efriedma-quic · 2025-11-19T19:44:39Z

I'll redirect the discussion to the RFC thread... I can go into a little more detail about what exactly I'm worried about.

llvm/include/llvm/IR/IntrinsicsAArch64.td

Lukacma

LGTM. Just wait with merging this patch until ACLE is merged in case there are further changes there.

Proposed in [this ACLE proposal](ARM-software/acle#409), this PR implements widening FMMLA intrinsics. - F16 to F32 - MF8 to F32 - MF8 to F16 Additional changes: - IsOverloadCvt flag renamed to IsOverloadFirstandLast for clarity, as the name implies conversion. Implementation remains unchanged.

Implement widening FMMLA intrinsics

9004ff2

- F16 to F32 - MF8 to F32 - MF8 to F16

llvmbot added clang Clang issues not falling into any other category backend:AArch64 clang:frontend Language frontend issues, e.g. anything involving "Sema" llvm:ir labels Oct 27, 2025

Amichaxx changed the title ~~Implement widening FMMLA intrinsics~~ [Clang][AArch64] Implement widening FMMLA intrinsics Oct 28, 2025

Lukacma reviewed Oct 29, 2025

View reviewed changes

Lukacma requested a review from CarolineConcatto October 29, 2025 14:35

kmclaughlin-arm reviewed Nov 11, 2025

View reviewed changes

llvm/test/CodeGen/AArch64/sve-fmmla-f32f16.ll Outdated Show resolved Hide resolved

clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_fmmla-f32f16.c Outdated Show resolved Hide resolved

- Implemented overloading for fmmla intrinsics, replaced fixed-type i…

4b37033

…ntrinsics - Prototype cleanups - Updated ll tests to remove unnecessary IR - Removed unused arguments in clang test macros - Removed redundant check lines in ll tests

kmclaughlin-arm reviewed Nov 12, 2025

View reviewed changes

llvm/include/llvm/IR/IntrinsicsAArch64.td Outdated Show resolved Hide resolved

Added new fp8 intrinsic in IntrinsicsAArch64.td, updated tests accord…

308d322

…ingly Formatting

Lukacma reviewed Nov 14, 2025

View reviewed changes

clang/include/clang/Basic/arm_sve.td Outdated Show resolved Hide resolved

llvm/include/llvm/IR/IntrinsicsAArch64.td Outdated Show resolved Hide resolved

Changed source type to fp8

eb6c322

Renamed IsOverloadCvt flag to IsOverloadFirstandLast for clarity

llvmbot added the clang:codegen IR generation bugs: mangling, exceptions, etc. label Nov 17, 2025

clang format

5132092

CarolineConcatto closed this Nov 19, 2025

CarolineConcatto reopened this Nov 19, 2025

Lukacma reviewed Nov 20, 2025

View reviewed changes

llvm/include/llvm/IR/IntrinsicsAArch64.td Outdated Show resolved Hide resolved

- fp8_fmmla changes

5052f47

Amichaxx force-pushed the fmmla-intrinsics-acle branch from 89d24ff to 5052f47 Compare November 20, 2025 14:04

Lukacma approved these changes Nov 20, 2025

View reviewed changes

CarolineConcatto approved these changes Dec 1, 2025

View reviewed changes

CarolineConcatto merged commit 7f2e6f1 into llvm:main Dec 5, 2025
10 checks passed

[Clang][AArch64] Implement widening FMMLA intrinsics #165282

[Clang][AArch64] Implement widening FMMLA intrinsics #165282

Uh oh!

Conversation

Amichaxx commented Oct 27, 2025 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

llvmbot commented Oct 27, 2025 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

llvmbot commented Oct 27, 2025

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

github-actions bot commented Nov 17, 2025 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

efriedma-quic commented Nov 17, 2025

Uh oh!

Lukacma commented Nov 18, 2025

Uh oh!

efriedma-quic commented Nov 18, 2025

Uh oh!

CarolineConcatto commented Nov 19, 2025 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

CarolineConcatto commented Nov 19, 2025

Uh oh!

github-actions bot commented Nov 19, 2025 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

🐧 Linux x64 Test Results

Uh oh!

efriedma-quic commented Nov 19, 2025

Uh oh!

Uh oh!

Lukacma left a comment

Choose a reason for hiding this comment

Uh oh!

Uh oh!

Reviewers

Assignees

Labels

Projects

Milestone

Development

Uh oh!

6 participants

Amichaxx commented Oct 27, 2025 •

edited

Loading

llvmbot commented Oct 27, 2025 •

edited

Loading

github-actions bot commented Nov 17, 2025 •

edited

Loading

CarolineConcatto commented Nov 19, 2025 •

edited

Loading

github-actions bot commented Nov 19, 2025 •

edited

Loading