Skip to content

Commit f9ba024

Browse files
Perf noinline (#1692)
* remove inline in perf benchmarks * follow suggestion to make links public Signed-off-by: Alexandre Eichenberger <[email protected]>
1 parent 372c069 commit f9ba024

File tree

8 files changed

+79
-39
lines changed

8 files changed

+79
-39
lines changed

docs/BuildOnWindows.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -99,7 +99,7 @@ call cmake %root_dir%\onnx-mlir -G "Ninja" ^
9999
-DLLVM_EXTERNAL_LIT=%lit_path% ^
100100
-DLLVM_LIT_ARGS=-v ^
101101
-DMLIR_DIR=%root_dir%\llvm-project\build\lib\cmake\mlir ^
102-
-DONNX_MLIR_BUILD_TESTS=OFF
102+
-DONNX_MLIR_BUILD_TESTS=ON
103103

104104
call cmake --build . --config Release --target onnx-mlir
105105
```

test/perf/CMakeLists.txt

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,18 @@
11
# SPDX-License-Identifier: Apache-2.0
22

3+
add_onnx_mlir_library(PerfLib
4+
PerfHelper.cpp
5+
EXCLUDE_FROM_OM_LIBS
6+
7+
INCLUDE_DIRS PRIVATE
8+
${ONNX_MLIR_SRC_ROOT}/third_party/benchmark/include
9+
${ONNX_MLIR_BIN_ROOT}/third_party/benchmark/include
10+
11+
LINK_LIBS PUBLIC
12+
benchmark
13+
CompilerUtils
14+
)
15+
316
add_custom_target(perf)
417
set_target_properties(perf PROPERTIES FOLDER "Perf")
518

@@ -37,7 +50,7 @@ endfunction()
3750

3851
# The CompilerUtils ExecutionSession are also included in ModelLib,
3952
# but it did not compile when I removed these two. TODO, figure out why.
40-
set(TEST_LINK_LIBS ModelLib CompilerUtils benchmark)
53+
set(TEST_LINK_LIBS ModelLib PerfLib)
4154

4255
add_perf_unittest(PerfGemm
4356
PerfGemm.cpp

test/perf/PerfConv.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ static void BM_Conv2D_C16_K1(benchmark::State &state) {
4242
for (auto _ : state)
4343
model.run();
4444
// FLOPS assume D=1, S=1.
45-
PERF_RECORD_FLOPS(2.0 * N * C * C * H * W * K * K);
45+
perf_recordFlops(state, 2.0 * N * C * C * H * W * K * K);
4646
}
4747
BENCHMARK(BM_Conv2D_C16_K1)
4848
->ArgsProduct({{1, 16, 64}, {16, 64, 256}})
@@ -64,7 +64,7 @@ static void BM_Conv2D_C16_K3(benchmark::State &state) {
6464
for (auto _ : state)
6565
model.run();
6666
// FLOPS assume D=1, S=1.
67-
PERF_RECORD_FLOPS(2.0 * N * C * C * H * W * K * K);
67+
perf_recordFlops(state, 2.0 * N * C * C * H * W * K * K);
6868
}
6969
BENCHMARK(BM_Conv2D_C16_K3)
7070
->ArgsProduct({{1, 16, 64}, {16, 64, 256}})

test/perf/PerfGemm.cpp

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ static void BM_MatrixVectorProduct(benchmark::State &state) {
3434
for (auto _ : state)
3535
model.run();
3636
state.SetComplexityN(I);
37-
PERF_RECORD_FLOPS(2.0 * I * J * K);
37+
perf_recordFlops(state, 2.0 * I * J * K);
3838
}
3939
BENCHMARK(BM_MatrixVectorProduct)
4040
->RangeMultiplier(2)
@@ -52,7 +52,7 @@ static void BM_MatmulSquare(benchmark::State &state) {
5252
for (auto _ : state)
5353
model.run();
5454
state.SetComplexityN(I);
55-
PERF_RECORD_FLOPS(2.0 * I * J * K);
55+
perf_recordFlops(state, 2.0 * I * J * K);
5656
}
5757
BENCHMARK(BM_MatmulSquare)
5858
->RangeMultiplier(2)
@@ -72,7 +72,7 @@ static void BM_MatmulSquareBroadcastB4x(benchmark::State &state) {
7272
for (auto _ : state)
7373
model.run();
7474
state.SetComplexityN(I);
75-
PERF_RECORD_FLOPS(/*broadcast 4x*/ 4.0 /*matmul*/ * 2.0 * I * J * K);
75+
perf_recordFlops(state, /*broadcast 4x*/ 4.0 /*matmul*/ * 2.0 * I * J * K);
7676
}
7777
BENCHMARK(BM_MatmulSquareBroadcastB4x)
7878
->RangeMultiplier(2)
@@ -92,7 +92,7 @@ static void BM_MatMulWithGemmSquare(benchmark::State &state) {
9292
model.run();
9393
state.SetComplexityN(I);
9494
// Because alpha is 1, it's not counted; beta is zero, sum of B is ignored.
95-
PERF_RECORD_FLOPS(1.0 * I * J * (2.0 * K - 1.0));
95+
perf_recordFlops(state, 1.0 * I * J * (2.0 * K - 1.0));
9696
}
9797
BENCHMARK(BM_MatMulWithGemmSquare)
9898
->RangeMultiplier(2)
@@ -112,7 +112,7 @@ static void BM_GemmSquare(benchmark::State &state) {
112112
model.run();
113113
state.SetComplexityN(I);
114114
// Because alpha is 1, it's not counted; beta is 1, sum of B is counted.
115-
PERF_RECORD_FLOPS(1.0 * I * J * (2.0 * K - 1.0) + I * K);
115+
perf_recordFlops(state, 1.0 * I * J * (2.0 * K - 1.0) + I * K);
116116
}
117117
BENCHMARK(BM_GemmSquare)
118118
->RangeMultiplier(2)

test/perf/PerfHelper.cpp

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
/*
2+
* SPDX-License-Identifier: Apache-2.0
3+
*/
4+
5+
//===================-- PerfHelper.cpp - Helper for perf tests -=============//
6+
//
7+
// Copyright 2022 The IBM Research Authors.
8+
//
9+
// =============================================================================
10+
//
11+
// This file contains helper functions for repetitive Benchmark
12+
// actions.
13+
//===----------------------------------------------------------------------===//
14+
15+
#include "llvm/Support/CommandLine.h"
16+
17+
#include "test/perf/PerfHelper.hpp"
18+
19+
// Pass f as a (double) number of FLOP in the measurement and report it as the
20+
// actual number (FLOP) and as a rate per second (FLOPS).
21+
void perf_recordFlops(benchmark::State &state, float f) {
22+
state.counters["FLOPS"] = benchmark::Counter(f,
23+
benchmark::Counter::kIsRate | benchmark::Counter::kIsIterationInvariant,
24+
benchmark::Counter::OneK::kIs1000);
25+
state.counters["FLOP"] = benchmark::Counter(
26+
f, benchmark::Counter::kDefaults, benchmark::Counter::OneK::kIs1000);
27+
}
28+
29+
// Define performance main, with default opt level of 3, and scan PERF_ARGS to
30+
// override default onnx-mlir compiler options.
31+
int perf_main(int argc, char **argv) {
32+
::benchmark::Initialize(&argc, argv);
33+
const int onnxMlirArgc = 2;
34+
const char *onnxMlirArgv[onnxMlirArgc];
35+
onnxMlirArgv[0] = argv[0];
36+
onnxMlirArgv[1] = "-O3";
37+
if (!llvm::cl::ParseCommandLineOptions(onnxMlirArgc, onnxMlirArgv,
38+
"set options for perf-algo", nullptr, /*env var*/ "PERF_ARGS"))
39+
return 2;
40+
if (::benchmark::ReportUnrecognizedArguments(argc, argv))
41+
return 1;
42+
::benchmark::RunSpecifiedBenchmarks();
43+
::benchmark::Shutdown();
44+
return 0;
45+
}

test/perf/PerfHelper.hpp

Lines changed: 6 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -12,33 +12,15 @@
1212
// actions.
1313
//===----------------------------------------------------------------------===//
1414

15+
#include <benchmark/benchmark.h>
16+
1517
// Pass f as a (double) number of FLOP in the measurement and report it as the
1618
// actual number (FLOP) and as a rate per second (FLOPS).
17-
#define PERF_RECORD_FLOPS(_f) \
18-
{ \
19-
state.counters["FLOPS"] = benchmark::Counter((_f), \
20-
benchmark::Counter::kIsRate | \
21-
benchmark::Counter::kIsIterationInvariant, \
22-
benchmark::Counter::OneK::kIs1000); \
23-
state.counters["FLOP"] = benchmark::Counter((_f), \
24-
benchmark::Counter::kDefaults, benchmark::Counter::OneK::kIs1000); \
25-
}
19+
void perf_recordFlops(benchmark::State &state, float f);
2620

2721
// Define performance main, with default opt level of 3, and scan PERF_ARGS to
2822
// override default onnx-mlir compiler options.
23+
int perf_main(int argc, char **argv);
24+
2925
#define PERF_MAIN() \
30-
int main(int argc, char **argv) { \
31-
::benchmark::Initialize(&argc, argv); \
32-
int onnxMlirArgc = 2; \
33-
const char *onnxMlirArgv[onnxMlirArgc]; \
34-
onnxMlirArgv[0] = argv[0]; \
35-
onnxMlirArgv[1] = "-O3"; \
36-
if (!llvm::cl::ParseCommandLineOptions(onnxMlirArgc, onnxMlirArgv, \
37-
"set options for perf-algo", nullptr, /*env var*/ "PERF_ARGS")) \
38-
return 2; \
39-
if (::benchmark::ReportUnrecognizedArguments(argc, argv)) \
40-
return 1; \
41-
::benchmark::RunSpecifiedBenchmarks(); \
42-
::benchmark::Shutdown(); \
43-
return 0; \
44-
}
26+
int main(int argc, char **argv) { return perf_main(argc, argv); }

test/perf/PerfRNN.cpp

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -72,7 +72,7 @@ static void BM_LSTM(benchmark::State &state) {
7272
// FLOPS for LSTM: ignore activations, assume static S and B.
7373
// Eight matrix-matrix multiplications are combined into two
7474
// matrix-matrix multiplications: [B,I]x[I,4*H] and [B,H]x[H,4*H].
75-
PERF_RECORD_FLOPS(
75+
perf_recordFlops(state,
7676
D * S * (4.0 * B * H * (2.0 * I - 1.0) + 4.0 * B * H * (2.0 * H - 1.0)));
7777
}
7878
BENCHMARK(BM_LSTM)->Apply(CommonArgs)->Unit(benchmark::kMillisecond);
@@ -94,7 +94,7 @@ static void BM_GRU_LINEAR_BEFORE_RESET(benchmark::State &state) {
9494
// FLOPS for GRU: ignore activations, assume static S and B.
9595
// Six matrix-matrix multiplications are combined into two
9696
// matrix-matrix multiplications: [B,I]x[I,3*H] and [B,H]x[H,3*H].
97-
PERF_RECORD_FLOPS(
97+
perf_recordFlops(state,
9898
D * S * (3.0 * B * H * (2.0 * I - 1.0) + 3.0 * B * H * (2.0 * H - 1.0)));
9999
}
100100
BENCHMARK(BM_GRU_LINEAR_BEFORE_RESET)
@@ -118,7 +118,7 @@ static void BM_GRU_LINEAR_AFTER_RESET(benchmark::State &state) {
118118
// FLOPS for GRU: ignore activations, assume static S and B.
119119
// Six matrix-matrix multiplications are combined into two
120120
// matrix-matrix multiplications: [B,I]x[I,3*H] and [B,H]x[H,3*H].
121-
PERF_RECORD_FLOPS(
121+
perf_recordFlops(state,
122122
D * S * (3.0 * B * H * (2.0 * I - 1.0) + 3.0 * B * H * (2.0 * H - 1.0)));
123123
}
124124
BENCHMARK(BM_GRU_LINEAR_AFTER_RESET)
@@ -141,8 +141,8 @@ static void BM_RNN(benchmark::State &state) {
141141
rnn.run();
142142
// FLOPS for RNN: ignore activations, assume static S and B.
143143
// Two matrix-matrix multiplications: [B,I]x[I,H] and [B,H]x[H,H].
144-
PERF_RECORD_FLOPS(
145-
D * S * (B * H * (2.0 * I - 1.0) + B * H * (2.0 * H - 1.0)));
144+
perf_recordFlops(
145+
state, D * S * (B * H * (2.0 * I - 1.0) + B * H * (2.0 * H - 1.0)));
146146
}
147147
BENCHMARK(BM_RNN)->Apply(CommonArgs)->Unit(benchmark::kMillisecond);
148148

utils/build-onnx-mlir.cmd

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,6 @@ call cmake %root_dir%\onnx-mlir -G "Ninja" ^
88
-DLLVM_EXTERNAL_LIT=%lit_path% ^
99
-DLLVM_LIT_ARGS=-v ^
1010
-DMLIR_DIR=%root_dir%\llvm-project\build\lib\cmake\mlir ^
11-
-DONNX_MLIR_BUILD_TESTS=OFF
11+
-DONNX_MLIR_BUILD_TESTS=ON
1212

1313
call cmake --build . --config Release --target onnx-mlir

0 commit comments

Comments
 (0)