 //
 // Inlining.
 //
-// For now, this does a conservative inlining of all functions that have
+// By default, this does a conservative inlining of all functions that have
 // exactly one use, and are fairly small. That should not increase code
 // size, and may have speed benefits.
 //
+// When opt level is 3+ (-O3 or above), we more aggressively inline
+// even functions with more than one use, that seem to be "lightweight"
+// (no loops or calls etc.), so inlining them may get rid of call overhead
+// that would be noticeable otherwise.
+//
 
 #include <atomic>
 
 
 namespace wasm {
 
-// A limit on how big a function to inline.
-static const int INLINING_SIZE_LIMIT = 15;
+// A limit on how big a function to inline when being careful about size.
+static const int CAREFUL_SIZE_LIMIT = 15;
+
+// A limit on how big a function to inline when being more flexible. In
+// particular it's nice that with this limit we can inline the clamp
+// functions (i32s-div, f64-to-int, etc.), which can affect perf.
+static const int FLEXIBLE_SIZE_LIMIT = 20;
+
+// Useful info on a function, helping us decide if we can inline it.
+struct FunctionInfo {
+  std::atomic<Index> calls;
+  Index size;
+  bool lightweight = true;
+  bool usedGlobally = false; // in a table or export
+
+  bool worthInlining(PassOptions& options, bool allowMultipleInliningsPerFunction) {
+    // if it's big, it's just not worth doing (TODO: investigate more)
+    if (size > FLEXIBLE_SIZE_LIMIT) return false;
+    // if it has one use, then inlining it would likely reduce code size,
+    // since we are just moving code around, + optimizing, so worth it
+    // if small enough that we are pretty sure it's ok
+    if (calls == 1 && !usedGlobally && size <= CAREFUL_SIZE_LIMIT) return true;
+    if (!allowMultipleInliningsPerFunction) return false;
+    // more than one use, so we can't eliminate it after inlining,
+    // so only worth it if we really care about speed and don't care
+    // about size, and if it's lightweight, so a good candidate for
+    // speeding us up
+    return options.optimizeLevel >= 3 && options.shrinkLevel == 0 && lightweight;
+  }
+};
 
-// We only inline a function with a single use.
-static const int SINGLE_USE = 1;
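+// Map of function name => info on it. All entries are created before the
+// parallel scan (see calculateInfos()), so the map itself never changes while
+// threads run; calls is atomic since any thread may increment it.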
+typedef std::unordered_map<Name, FunctionInfo> NameInfoMap;
 
-// A number of uses of a function that is too high for us to
-// inline it to all those locations.
-static const int TOO_MANY_USES_TO_INLINE = SINGLE_USE + 1;
+struct FunctionInfoScanner : public WalkerPass<PostWalker<FunctionInfoScanner>> {
+  bool isFunctionParallel() override { return true; }
 
-// Map of function name => number of uses. We build the values in
-// parallel, using atomic increments. This is safe because we never
-// update the map itself in parallel, we only update the values,
-// and so the map never allocates or moves values which could be
-// a problem with atomics (in fact it would be a problem in general
-// as well, not just with atomics, as we don't use a lock in
-// parallel access, we depend on the map itself being constant
-// when running multiple threads).
-typedef std::map<Name, std::atomic<Index>> NameToAtomicIndexMap;
+  FunctionInfoScanner(NameInfoMap* infos) : infos(infos) {}
 
-struct FunctionUseCounter : public WalkerPass<PostWalker<FunctionUseCounter>> {
-  bool isFunctionParallel() override { return true; }
+  FunctionInfoScanner* create() override {
+    return new FunctionInfoScanner(infos);
+  }
 
-  FunctionUseCounter(NameToAtomicIndexMap* uses) : uses(uses) {}
+  void visitLoop(Loop* curr) {
+    // having a loop is not lightweight
+    (*infos)[getFunction()->name].lightweight = false;
+  }
 
-  FunctionUseCounter* create() override {
-    return new FunctionUseCounter(uses);
+  void visitCall(Call* curr) {
+    assert(infos->count(curr->target) > 0); // can't add a new element in parallel
+    (*infos)[curr->target].calls++;
+    // having a call is not lightweight
+    (*infos)[getFunction()->name].lightweight = false;
   }
 
-  void visitCall(Call *curr) {
-    assert(uses->count(curr->target) > 0); // can't add a new element in parallel
-    (*uses)[curr->target]++;
+  void visitFunction(Function* curr) {
+    (*infos)[curr->name].size = Measurer::measure(curr->body);
   }
 
 private:
-  NameToAtomicIndexMap* uses;
+  NameInfoMap* infos;
 };
 
 struct InliningAction {
@@ -79,8 +108,8 @@ struct InliningAction {
 };
 
 struct InliningState {
-  std::set<Name> canInline;
-  std::map<Name, std::vector<InliningAction>> actionsForFunction; // function name => actions that can be performed in it
+  std::unordered_set<Name> worthInlining;
+  std::unordered_map<Name, std::vector<InliningAction>> actionsForFunction; // function name => actions that can be performed in it
 };
 
 struct Planner : public WalkerPass<PostWalker<Planner>> {
@@ -95,7 +124,7 @@ struct Planner : public WalkerPass<PostWalker<Planner>> {
   void visitCall(Call* curr) {
     // plan to inline if we know this is valid to inline, and if the call is
     // actually performed - if it is dead code, it's pointless to inline
-    if (state->canInline.count(curr->target) &&
+    if (state->worthInlining.count(curr->target) &&
         curr->type != unreachable) {
       // nest the call in a block. that way the location of the pointer to the call will not
       // change even if we inline multiple times into the same function, otherwise
@@ -110,7 +139,7 @@ struct Planner : public WalkerPass<PostWalker<Planner>> {
   void doWalkFunction(Function* func) {
     // we shouldn't inline into us if we are to be inlined
     // ourselves - that has the risk of cycles
-    if (state->canInline.count(func->name) == 0) {
+    if (state->worthInlining.count(func->name) == 0) {
       walk(func->body);
     }
   }
@@ -169,33 +198,43 @@ struct Inlining : public Pass {
   // whether to optimize where we inline
   bool optimize = false;
 
-  NameToAtomicIndexMap uses;
+  NameInfoMap infos;
+
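+  // whether this is the first iteration of the loop in run(); only on the first
+  // iteration do we allow inlining a function into more than one caller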
+  bool firstIteration;
 
   void run(PassRunner* runner, Module* module) override {
     // keep going while we inline, to handle nesting. TODO: optimize
-    calculateUses(module);
-    while (iteration(runner, module)) {}
+    firstIteration = true;
+    while (1) {
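+      // recompute all the info each time around, as inlining may have changed
+      // call counts and function sizes since the last iteration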
+      calculateInfos(module);
+      if (!iteration(runner, module)) {
+        return;
+      }
+      firstIteration = false;
+    }
   }
 
-  void calculateUses(Module* module) {
-    // fill in uses, as we operate on it in parallel (each function to its own entry)
+  void calculateInfos(Module* module) {
+    infos.clear();
+    // fill in info, as we operate on it in parallel (each function to its own entry)
     for (auto& func : module->functions) {
-      uses[func->name].store(0);
+      infos[func->name];
     }
     PassRunner runner(module);
     runner.setIsNested(true);
-    runner.add<FunctionUseCounter>(&uses);
+    runner.add<FunctionInfoScanner>(&infos);
     runner.run();
+    // fill in global uses
     // anything exported or used in a table should not be inlined
     for (auto& ex : module->exports) {
       if (ex->kind == ExternalKind::Function) {
-        uses[ex->value].store(TOO_MANY_USES_TO_INLINE);
+        infos[ex->value].usedGlobally = true;
       }
     }
     for (auto& segment : module->table.segments) {
       for (auto name : segment.data) {
         if (module->getFunctionOrNull(name)) {
-          uses[name].store(TOO_MANY_USES_TO_INLINE);
+          infos[name].usedGlobally = true;
         }
       }
     }
@@ -205,12 +244,12 @@ struct Inlining : public Pass {
     // decide which to inline
     InliningState state;
     for (auto& func : module->functions) {
-      auto name = func->name;
-      auto numUses = uses[name].load();
-      if (canInline(numUses) && worthInlining(module->getFunction(name))) {
-        state.canInline.insert(name);
+      // on the first iteration, allow multiple inlinings per function
+      if (infos[func->name].worthInlining(runner->options, firstIteration /* allowMultipleInliningsPerFunction */)) {
+        state.worthInlining.insert(func->name);
       }
     }
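+    // if nothing is worth inlining, there is nothing more to do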
+    if (state.worthInlining.size() == 0) return false;
     // fill in actionsForFunction, as we operate on it in parallel (each function to its own entry)
     for (auto& func : module->functions) {
       state.actionsForFunction[func->name];
@@ -222,17 +261,16 @@ struct Inlining : public Pass {
       runner.add<Planner>(&state);
       runner.run();
     }
-    // perform inlinings
-    std::set<Name> inlined;
-    std::set<Function*> inlinedInto;
+    // perform inlinings TODO: parallelize
+    std::unordered_map<Name, Index> inlinedUses; // how many uses we inlined
+    std::unordered_set<Function*> inlinedInto; // which functions were inlined into
     for (auto& func : module->functions) {
       for (auto& action : state.actionsForFunction[func->name]) {
         Name inlinedName = action.contents->name;
         doInlining(module, func.get(), action);
-        inlined.insert(inlinedName);
+        inlinedUses[inlinedName]++;
         inlinedInto.insert(func.get());
-        uses[inlinedName]--;
-        assert(uses[inlinedName].load() == 0);
+        assert(inlinedUses[inlinedName] <= infos[inlinedName].calls);
       }
     }
     // anything we inlined into may now have non-unique label names, fix it up
@@ -242,26 +280,20 @@ struct Inlining : public Pass {
     if (optimize && inlinedInto.size() > 0) {
       doOptimize(inlinedInto, module, runner);
     }
-    // remove functions that we managed to inline, their one use is gone
+    // remove functions that we no longer need after inlining
     auto& funcs = module->functions;
-    funcs.erase(std::remove_if(funcs.begin(), funcs.end(), [&inlined](const std::unique_ptr<Function>& curr) {
-      return inlined.count(curr->name) > 0;
+    funcs.erase(std::remove_if(funcs.begin(), funcs.end(), [&](const std::unique_ptr<Function>& curr) {
+      auto name = curr->name;
+      auto& info = infos[name];
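+      // remove the function only if we inlined away every one of its uses and
+      // it is not otherwise reachable (exported or in the table)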
+      return inlinedUses.count(name) && inlinedUses[name] == info.calls && !info.usedGlobally;
     }), funcs.end());
     // return whether we did any work
-    return inlined.size() > 0;
-  }
-
-  bool canInline(int numUses) {
-    return numUses == SINGLE_USE;
-  }
-
-  bool worthInlining(Function* func) {
-    return Measurer::measure(func->body) <= INLINING_SIZE_LIMIT;
+    return inlinedUses.size() > 0;
   }
 
   // Run useful optimizations after inlining, things like removing
   // unnecessary new blocks, sharing variables, etc.
-  void doOptimize(std::set<Function*>& funcs, Module* module, PassRunner* parentRunner) {
+  void doOptimize(std::unordered_set<Function*>& funcs, Module* module, PassRunner* parentRunner) {
     // save the full list of functions on the side
     std::vector<std::unique_ptr<Function>> all;
     all.swap(module->functions);