Skip to content

Commit cfeddd7

Browse files
authored
Inline many (#1125)
* Improve inlining pass to inline single-use functions that are fairly small, which makes it useful for removing unnecessary global constructors from clang. * Add an inlining-optimizing pass that also optimizes where it inlined, as new opportunities arise. Enable it by default in -O2+. * In addition, in -O3+ also inline small functions with multiple uses. This helps a lot with things like safe-int-divide functions (where each int divide is replaced by a safe divide that won't trap). Inlining gets rid of around half of the overhead there.
1 parent ad8bc65 commit cfeddd7

File tree

6 files changed

+31825
-64
lines changed

6 files changed

+31825
-64
lines changed

src/passes/Inlining.cpp

Lines changed: 94 additions & 62 deletions
Original file line numberDiff line numberDiff line change
@@ -17,10 +17,15 @@
1717
//
1818
// Inlining.
1919
//
20-
// For now, this does a conservative inlining of all functions that have
20+
// By default, this does a conservative inlining of all functions that have
2121
// exactly one use, and are fairly small. That should not increase code
2222
// size, and may have speed benefits.
2323
//
24+
// When opt level is 3+ (-O3 or above), we more aggressively inline
25+
// even functions with more than one use, that seem to be "lightweight"
26+
// (no loops or calls etc.), so inlining them may get rid of call overhead
27+
// that would be noticeable otherwise
28+
//
2429

2530
#include <atomic>
2631

@@ -33,42 +38,66 @@
3338

3439
namespace wasm {
3540

36-
// A limit on how big a function to inline.
37-
static const int INLINING_SIZE_LIMIT = 15;
41+
// A limit on how big a function to inline when being careful about size
42+
static const int CAREFUL_SIZE_LIMIT = 15;
43+
44+
// A limit on how big a function to inline when being more flexible. In
45+
// particular it's nice that with this limit we can inline the clamp
46+
// functions (i32s-div, f64-to-int, etc.), that can affect perf.
47+
static const int FLEXIBLE_SIZE_LIMIT = 20;
48+
49+
// Useful info on a function, helping us decide if we can inline it
50+
struct FunctionInfo {
51+
std::atomic<Index> calls;
52+
Index size;
53+
bool lightweight = true;
54+
bool usedGlobally = false; // in a table or export
55+
56+
bool worthInlining(PassOptions& options, bool allowMultipleInliningsPerFunction) {
57+
// if it's big, it's just not worth doing (TODO: investigate more)
58+
if (size > FLEXIBLE_SIZE_LIMIT) return false;
59+
// if it has one use, then inlining it would likely reduce code size
60+
// since we are just moving code around, + optimizing, so worth it
61+
// if small enough that we are pretty sure it's ok
62+
if (calls == 1 && !usedGlobally && size <= CAREFUL_SIZE_LIMIT) return true;
63+
if (!allowMultipleInliningsPerFunction) return false;
64+
// more than one use, so we can't eliminate it after inlining,
65+
// so only worth it if we really care about speed and don't care
66+
// about size, and if it's lightweight so a good candidate for
67+
// speeding us up
68+
return options.optimizeLevel >= 3 && options.shrinkLevel == 0 && lightweight;
69+
}
70+
};
3871

39-
// We only inline a function with a single use.
40-
static const int SINGLE_USE = 1;
72+
typedef std::unordered_map<Name, FunctionInfo> NameInfoMap;
4173

42-
// A number of uses of a function that is too high for us to
43-
// inline it to all those locations.
44-
static const int TOO_MANY_USES_TO_INLINE = SINGLE_USE + 1;
74+
struct FunctionInfoScanner : public WalkerPass<PostWalker<FunctionInfoScanner>> {
75+
bool isFunctionParallel() override { return true; }
4576

46-
// Map of function name => number of uses. We build the values in
47-
// parallel, using atomic increments. This is safe because we never
48-
// update the map itself in parallel, we only update the values,
49-
// and so the map never allocates or moves values which could be
50-
// a problem with atomics (in fact it would be a problem in general
51-
// as well, not just with atomics, as we don't use a lock in
52-
// parallel access, we depend on the map itself being constant
53-
// when running multiple threads).
54-
typedef std::map<Name, std::atomic<Index>> NameToAtomicIndexMap;
77+
FunctionInfoScanner(NameInfoMap* infos) : infos(infos) {}
5578

56-
struct FunctionUseCounter : public WalkerPass<PostWalker<FunctionUseCounter>> {
57-
bool isFunctionParallel() override { return true; }
79+
FunctionInfoScanner* create() override {
80+
return new FunctionInfoScanner(infos);
81+
}
5882

59-
FunctionUseCounter(NameToAtomicIndexMap* uses) : uses(uses) {}
83+
void visitLoop(Loop* curr) {
84+
// having a loop is not lightweight
85+
(*infos)[getFunction()->name].lightweight = false;
86+
}
6087

61-
FunctionUseCounter* create() override {
62-
return new FunctionUseCounter(uses);
88+
void visitCall(Call* curr) {
89+
assert(infos->count(curr->target) > 0); // can't add a new element in parallel
90+
(*infos)[curr->target].calls++;
91+
// having a call is not lightweight
92+
(*infos)[getFunction()->name].lightweight = false;
6393
}
6494

65-
void visitCall(Call *curr) {
66-
assert(uses->count(curr->target) > 0); // can't add a new element in parallel
67-
(*uses)[curr->target]++;
95+
void visitFunction(Function* curr) {
96+
(*infos)[curr->name].size = Measurer::measure(curr->body);
6897
}
6998

7099
private:
71-
NameToAtomicIndexMap* uses;
100+
NameInfoMap* infos;
72101
};
73102

74103
struct InliningAction {
@@ -79,8 +108,8 @@ struct InliningAction {
79108
};
80109

81110
struct InliningState {
82-
std::set<Name> canInline;
83-
std::map<Name, std::vector<InliningAction>> actionsForFunction; // function name => actions that can be performed in it
111+
std::unordered_set<Name> worthInlining;
112+
std::unordered_map<Name, std::vector<InliningAction>> actionsForFunction; // function name => actions that can be performed in it
84113
};
85114

86115
struct Planner : public WalkerPass<PostWalker<Planner>> {
@@ -95,7 +124,7 @@ struct Planner : public WalkerPass<PostWalker<Planner>> {
95124
void visitCall(Call* curr) {
96125
// plan to inline if we know this is valid to inline, and if the call is
97126
// actually performed - if it is dead code, it's pointless to inline
98-
if (state->canInline.count(curr->target) &&
127+
if (state->worthInlining.count(curr->target) &&
99128
curr->type != unreachable) {
100129
// nest the call in a block. that way the location of the pointer to the call will not
101130
// change even if we inline multiple times into the same function, otherwise
@@ -110,7 +139,7 @@ struct Planner : public WalkerPass<PostWalker<Planner>> {
110139
void doWalkFunction(Function* func) {
111140
// we shouldn't inline into us if we are to be inlined
112141
// ourselves - that has the risk of cycles
113-
if (state->canInline.count(func->name) == 0) {
142+
if (state->worthInlining.count(func->name) == 0) {
114143
walk(func->body);
115144
}
116145
}
@@ -169,33 +198,43 @@ struct Inlining : public Pass {
169198
// whether to optimize where we inline
170199
bool optimize = false;
171200

172-
NameToAtomicIndexMap uses;
201+
NameInfoMap infos;
202+
203+
bool firstIteration;
173204

174205
void run(PassRunner* runner, Module* module) override {
175206
// keep going while we inline, to handle nesting. TODO: optimize
176-
calculateUses(module);
177-
while (iteration(runner, module)) {}
207+
firstIteration = true;
208+
while (1) {
209+
calculateInfos(module);
210+
if (!iteration(runner, module)) {
211+
return;
212+
}
213+
firstIteration = false;
214+
}
178215
}
179216

180-
void calculateUses(Module* module) {
181-
// fill in uses, as we operate on it in parallel (each function to its own entry)
217+
void calculateInfos(Module* module) {
218+
infos.clear();
219+
// fill in info, as we operate on it in parallel (each function to its own entry)
182220
for (auto& func : module->functions) {
183-
uses[func->name].store(0);
221+
infos[func->name];
184222
}
185223
PassRunner runner(module);
186224
runner.setIsNested(true);
187-
runner.add<FunctionUseCounter>(&uses);
225+
runner.add<FunctionInfoScanner>(&infos);
188226
runner.run();
227+
// fill in global uses
189228
// anything exported or used in a table should not be inlined
190229
for (auto& ex : module->exports) {
191230
if (ex->kind == ExternalKind::Function) {
192-
uses[ex->value].store(TOO_MANY_USES_TO_INLINE);
231+
infos[ex->value].usedGlobally = true;
193232
}
194233
}
195234
for (auto& segment : module->table.segments) {
196235
for (auto name : segment.data) {
197236
if (module->getFunctionOrNull(name)) {
198-
uses[name].store(TOO_MANY_USES_TO_INLINE);
237+
infos[name].usedGlobally = true;
199238
}
200239
}
201240
}
@@ -205,12 +244,12 @@ struct Inlining : public Pass {
205244
// decide which to inline
206245
InliningState state;
207246
for (auto& func : module->functions) {
208-
auto name = func->name;
209-
auto numUses = uses[name].load();
210-
if (canInline(numUses) && worthInlining(module->getFunction(name))) {
211-
state.canInline.insert(name);
247+
// on the first iteration, allow multiple inlinings per function
248+
if (infos[func->name].worthInlining(runner->options, firstIteration /* allowMultipleInliningsPerFunction */)) {
249+
state.worthInlining.insert(func->name);
212250
}
213251
}
252+
if (state.worthInlining.size() == 0) return false;
214253
// fill in actionsForFunction, as we operate on it in parallel (each function to its own entry)
215254
for (auto& func : module->functions) {
216255
state.actionsForFunction[func->name];
@@ -222,17 +261,16 @@ struct Inlining : public Pass {
222261
runner.add<Planner>(&state);
223262
runner.run();
224263
}
225-
// perform inlinings
226-
std::set<Name> inlined;
227-
std::set<Function*> inlinedInto;
264+
// perform inlinings TODO: parallelize
265+
std::unordered_map<Name, Index> inlinedUses; // how many uses we inlined
266+
std::unordered_set<Function*> inlinedInto; // which functions were inlined into
228267
for (auto& func : module->functions) {
229268
for (auto& action : state.actionsForFunction[func->name]) {
230269
Name inlinedName = action.contents->name;
231270
doInlining(module, func.get(), action);
232-
inlined.insert(inlinedName);
271+
inlinedUses[inlinedName]++;
233272
inlinedInto.insert(func.get());
234-
uses[inlinedName]--;
235-
assert(uses[inlinedName].load() == 0);
273+
assert(inlinedUses[inlinedName] <= infos[inlinedName].calls);
236274
}
237275
}
238276
// anything we inlined into may now have non-unique label names, fix it up
@@ -242,26 +280,20 @@ struct Inlining : public Pass {
242280
if (optimize && inlinedInto.size() > 0) {
243281
doOptimize(inlinedInto, module, runner);
244282
}
245-
// remove functions that we managed to inline, their one use is gone
283+
// remove functions that we no longer need after inlining
246284
auto& funcs = module->functions;
247-
funcs.erase(std::remove_if(funcs.begin(), funcs.end(), [&inlined](const std::unique_ptr<Function>& curr) {
248-
return inlined.count(curr->name) > 0;
285+
funcs.erase(std::remove_if(funcs.begin(), funcs.end(), [&](const std::unique_ptr<Function>& curr) {
286+
auto name = curr->name;
287+
auto& info = infos[name];
288+
return inlinedUses.count(name) && inlinedUses[name] == info.calls && !info.usedGlobally;
249289
}), funcs.end());
250290
// return whether we did any work
251-
return inlined.size() > 0;
252-
}
253-
254-
bool canInline(int numUses) {
255-
return numUses == SINGLE_USE;
256-
}
257-
258-
bool worthInlining(Function* func) {
259-
return Measurer::measure(func->body) <= INLINING_SIZE_LIMIT;
291+
return inlinedUses.size() > 0;
260292
}
261293

262294
// Run useful optimizations after inlining, things like removing
263295
// unnecessary new blocks, sharing variables, etc.
264-
void doOptimize(std::set<Function*>& funcs, Module* module, PassRunner* parentRunner) {
296+
void doOptimize(std::unordered_set<Function*>& funcs, Module* module, PassRunner* parentRunner) {
265297
// save the full list of functions on the side
266298
std::vector<std::unique_ptr<Function>> all;
267299
all.swap(module->functions);

src/passes/pass.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -72,8 +72,8 @@ void PassRegistry::registerPasses() {
7272
registerPass("duplicate-function-elimination", "removes duplicate functions", createDuplicateFunctionEliminationPass);
7373
registerPass("extract-function", "leaves just one function (useful for debugging)", createExtractFunctionPass);
7474
registerPass("flatten-control-flow", "flattens out control flow to be only on blocks, not nested as expressions", createFlattenControlFlowPass);
75-
registerPass("inlining", "inlines functions (currently only ones with a single use)", createInliningPass);
76-
registerPass("inlining-optimizing", "inlines functions (currently only ones with a single use) and optimizes where we inlined", createInliningOptimizingPass);
75+
registerPass("inlining", "inlines functions", createInliningPass);
76+
registerPass("inlining-optimizing", "inlines functions and optimizes where we inlined", createInliningOptimizingPass);
7777
registerPass("legalize-js-interface", "legalizes i64 types on the import/export boundary", createLegalizeJSInterfacePass);
7878
registerPass("local-cse", "common subexpression elimination inside basic blocks", createLocalCSEPass);
7979
registerPass("log-execution", "instrument the build with logging of where execution goes", createLogExecutionPass);

0 commit comments

Comments
 (0)