Skip to content

Commit ecff6e3

Browse files
authored
Improve: Include Asm tests into macOS Arm builds (#45)
1 parent 966d168 commit ecff6e3

File tree

3 files changed

+28
-17
lines changed

3 files changed

+28
-17
lines changed

CMakeLists.txt

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -343,7 +343,10 @@ set_target_properties(less_slow PROPERTIES POSITION_INDEPENDENT_CODE ON)
343343
if (CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|amd64|AMD64|x64")
344344
set_source_files_properties(less_slow_amd64.S PROPERTIES LANGUAGE ASM)
345345
target_sources(less_slow PRIVATE less_slow_amd64.S)
346-
elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|ARM64")
346+
elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|ARM64|arm64")
347+
if (APPLE)
348+
set(CMAKE_ASM_FLAGS "${CMAKE_ASM_FLAGS} -march=armv8.6-a+bf16")
349+
endif()
347350
set_source_files_properties(less_slow_aarch64.S PROPERTIES LANGUAGE ASM)
348351
target_sources(less_slow PRIVATE less_slow_aarch64.S)
349352
endif ()

less_slow.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1352,7 +1352,7 @@ BENCHMARK(integral_division_with_doubles);
13521352
* while the internal logic remains identical.
13531353
*/
13541354

1355-
#if defined(__GNUC__) && !defined(__clang__)
1355+
#if defined(__GNUC__)
13561356

13571357
#if defined(__x86_64__) || defined(__i386__)
13581358
[[gnu::target("arch=core2")]]

less_slow_aarch64.S

Lines changed: 23 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -3,21 +3,27 @@
33
# Micro-kernels for building a performance-first mindset for 64-bit ARM (NEON).
44
# ----------------------------------------------------------------------------
55

6-
.section .text
7-
.global i32_add_asm_kernel
6+
#ifdef __APPLE__
7+
#define SYMBOL_NAME(name) _##name // Add underscore on macOS
8+
#else
9+
#define SYMBOL_NAME(name) name // No underscore on GNU-based systems
10+
#endif
11+
12+
.text
13+
.global SYMBOL_NAME(i32_add_asm_kernel)
814
9-
.global tops_f64_neon_asm_kernel
10-
.global tops_f32_neon_asm_kernel
11-
.global tops_f16_neon_asm_kernel
12-
.global tops_bf16_neon_asm_kernel
13-
.global tops_i8_neon_asm_kernel
14-
.global tops_u8_neon_asm_kernel
15+
.global SYMBOL_NAME(tops_f64_neon_asm_kernel)
16+
.global SYMBOL_NAME(tops_f32_neon_asm_kernel)
17+
.global SYMBOL_NAME(tops_f16_neon_asm_kernel)
18+
.global SYMBOL_NAME(tops_bf16_neon_asm_kernel)
19+
.global SYMBOL_NAME(tops_i8_neon_asm_kernel)
20+
.global SYMBOL_NAME(tops_u8_neon_asm_kernel)
1521

1622
# ----------------------------------------------------------------------------
1723
# Simple function that adds two 32-bit integers.
1824
# AArch64 ABI: W0 = 'a', W1 = 'b'. Return in W0.
1925
# ----------------------------------------------------------------------------
20-
i32_add_asm_kernel:
26+
SYMBOL_NAME(i32_add_asm_kernel):
2127
add w0, w0, w1
2228
ret
2329

@@ -26,7 +32,7 @@ i32_add_asm_kernel:
2632
# Each FMLA vD.2d, vN.2d, vM.2d => 2 multiplies + 2 adds = 4 FLOPs.
2733
# We'll do 10 instructions => 10 × 4 = 40 FLOPs total, returning 40 in W0.
2834
# ----------------------------------------------------------------------------
29-
tops_f64_neon_asm_kernel:
35+
SYMBOL_NAME(tops_f64_neon_asm_kernel):
3036
fmla v0.2d, v1.2d, v2.2d
3137
fmla v3.2d, v4.2d, v5.2d
3238
fmla v6.2d, v7.2d, v8.2d
@@ -47,7 +53,7 @@ tops_f64_neon_asm_kernel:
4753
# Let's do 10 instructions => 10 × 8 = 80 FLOPs total.
4854
# Return 80 in W0.
4955
# ----------------------------------------------------------------------------
50-
tops_f32_neon_asm_kernel:
56+
SYMBOL_NAME(tops_f32_neon_asm_kernel):
5157
fmla v0.4s, v1.4s, v2.4s
5258
fmla v3.4s, v4.4s, v5.4s
5359
fmla v6.4s, v7.4s, v8.4s
@@ -68,7 +74,7 @@ tops_f32_neon_asm_kernel:
6874
# Each FMLA vD.8h, vN.8h, vM.8h => 8 multiplies + 8 adds = 16 FLOPs.
6975
# We'll do 10 instructions => 160 FLOPs total, returning 160 in W0.
7076
# ----------------------------------------------------------------------------
71-
tops_f16_neon_asm_kernel:
77+
SYMBOL_NAME(tops_f16_neon_asm_kernel):
7278
fmla v0.8h, v1.8h, v2.8h
7379
fmla v3.8h, v4.8h, v5.8h
7480
fmla v6.8h, v7.8h, v8.8h
@@ -89,7 +95,7 @@ tops_f16_neon_asm_kernel:
8995
# bfmmla vD.4s, vN.8h, vM.8h => 8 multiplies + 8 adds = 16 FLOPs.
9096
# We'll do 10 instructions => 160 FLOPs total, returning 160 in W0.
9197
# ----------------------------------------------------------------------------
92-
tops_bf16_neon_asm_kernel:
98+
SYMBOL_NAME(tops_bf16_neon_asm_kernel):
9399
bfmmla v0.4s, v1.8h, v2.8h
94100
bfmmla v3.4s, v4.8h, v5.8h
95101
bfmmla v6.4s, v7.8h, v8.8h
@@ -110,7 +116,7 @@ tops_bf16_neon_asm_kernel:
110116
# sdot vD.4s, vN.16b, vM.16b => 16 multiplies + 16 adds = 32 integer ops.
111117
# We'll do 10 instructions => 320 integer ops total, returning 320 in W0.
112118
# ----------------------------------------------------------------------------
113-
tops_i8_neon_asm_kernel:
119+
SYMBOL_NAME(tops_i8_neon_asm_kernel):
114120
sdot v0.4s, v1.16b, v2.16b
115121
sdot v3.4s, v4.16b, v5.16b
116122
sdot v6.4s, v7.16b, v8.16b
@@ -131,7 +137,7 @@ tops_i8_neon_asm_kernel:
131137
# udot vD.4s, vN.16b, vM.16b => 16 multiplies + 16 adds = 32 integer ops.
132138
# We'll do 10 instructions => 320 integer ops total, returning 320 in W0.
133139
# ----------------------------------------------------------------------------
134-
tops_u8_neon_asm_kernel:
140+
SYMBOL_NAME(tops_u8_neon_asm_kernel):
135141
udot v0.4s, v1.16b, v2.16b
136142
udot v3.4s, v4.16b, v5.16b
137143
udot v6.4s, v7.16b, v8.16b
@@ -148,5 +154,7 @@ tops_u8_neon_asm_kernel:
148154

149155
# ----------------------------------------------------------------------------
150156
# Tell the linker/assembler that we do NOT need an executable stack (the GNU-stack note is an ELF convention, so it is emitted on Linux only):
157+
#ifdef __linux__
151158
.section .note.GNU-stack, "", @progbits
159+
#endif
152160
# ----------------------------------------------------------------------------

0 commit comments

Comments
 (0)