/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*-
* vim: set ts=8 sts=2 et sw=2 tw=80:
* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
#ifndef wasm_WasmHeuristics_h
#define wasm_WasmHeuristics_h
#include "mozilla/Assertions.h"  // MOZ_RELEASE_ASSERT, MOZ_CRASH
#include "mozilla/Likely.h"      // MOZ_LIKELY
#include <algorithm>  // std::clamp
#include <math.h>
#include <stdint.h>
#include "js/Prefs.h"
#include "threading/ExclusiveData.h"
#include "vm/MutexIDs.h"
namespace js {
namespace wasm {
// Classes LazyTieringHeuristics and InliningHeuristics answer simple
// questions relating to lazy tiering and inlining, e.g. "is this function
// small enough to inline?" They do not answer questions that involve
// carrying state (e.g. remaining inlining budget) across multiple queries.
//
// Note also that they may be queried in parallel, without locking, by
// multiple instantiating / compilation threads, and so must be immutable
// once created.
//
// For both LazyTieringHeuristics and InliningHeuristics, the default `level_`
// is set to 5 in modules/libpref/init/StaticPrefList.yaml. The scaling
// factors and tables defined in this file have been set so as to give
// near-optimal performance on Barista-3 and another benchmark; they are
// generally within 2% of the best value that can be found by changing the
// `level_` numbers. Further performance gains may depend on improving the
// accuracy of estimateIonCompilationCost().
//
// Performance was measured on a mid/high-end Intel CPU (Core i5-1135G7 --
// Tiger Lake) and a low end Intel (Celeron N3050 -- Goldmont).
class LazyTieringHeuristics {
static constexpr uint32_t MIN_LEVEL = 1;
static constexpr uint32_t MAX_LEVEL = 9;
// A scaling table for levels 2 .. 8. Levels 1 and 9 are special-cased. In
// this table, each value differs from its neighbour by a factor of 3, giving
// a dynamic range in the table of 3 ^ 6 == 729, hence a wide selection of
// tier-up aggressiveness.
static constexpr float scale_[7] = {27.0, 9.0, 3.0,
1.0, // default
0.333, 0.111, 0.037};
public:
// 1 = min (almost never, set tiering threshold to max possible, == 2^31-1)
// 5 = default
// 9 = max (request tier up at first call, set tiering threshold to zero)
//
// Don't use this directly, except for logging etc.
static uint32_t rawLevel() {
uint32_t level = JS::Prefs::wasm_lazy_tiering_level();
return std::clamp(level, MIN_LEVEL, MAX_LEVEL);
}
// Estimate the cost of compiling a function of bytecode size `bodyLength`
// using Ion, in terms of arbitrary work-units. The baseline code for the
// function counts down from the returned value as it runs. When the value
// goes negative it requests tier-up. See "[SMDOC] WebAssembly baseline
// compiler -- Lazy Tier-Up mechanism" in WasmBaselineCompile.cpp.
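//
// Illustrative worked example (using the constants in the code below, which
// may be retuned over time): for a function of bodyLength == 1000 at the
// default level (5), the estimate is 30000 + 4000 * 1000 = 4,030,000 insns,
// rescaled by 0.25 to 1,007,500 work units, and then by scale_[3] == 1.0,
// giving a tier-up threshold of roughly one million work units.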
static int32_t estimateIonCompilationCost(uint32_t bodyLength) {
uint32_t level = rawLevel();
if (MOZ_LIKELY(MIN_LEVEL < level && level < MAX_LEVEL)) {
// The estimated cost, in X86_64 insns, for Ion compilation:
// 30k up-front cost + 4k per bytecode byte.
//
// This is derived from measurements of an optimized build of Ion
// compiling about 99000 functions. Each estimate is pretty bad, but
// averaged over a number of functions it's often within 20% of correct.
// However, this is with no inlining; that causes a much wider variance
// of costs. This will need to be revisited at some point.
float thresholdF = 30000.0 + 4000.0 * float(bodyLength);
// Rescale to step-down work units, so that the default `level` setting
// (5) gives pretty good results.
thresholdF *= 0.25;
// Rescale again to take into account `level`.
thresholdF *= scale_[level - (MIN_LEVEL + 1)];
// Clamp and convert.
constexpr float thresholdHigh = 2.0e9f; // at most 2 billion;
int32_t thresholdI = int32_t(std::clamp(thresholdF, 10.f, thresholdHigh));
MOZ_RELEASE_ASSERT(thresholdI >= 0);
return thresholdI;
}
if (level == MIN_LEVEL) {
// "almost never tier up"; produce our closest approximation to infinity
return INT32_MAX;
}
if (level == MAX_LEVEL) {
// request tier up at the first call; return the lowest possible value
return 0;
}
MOZ_CRASH();
}
};
class InliningHeuristics {
static constexpr uint32_t MIN_LEVEL = 1;
static constexpr uint32_t MAX_LEVEL = 9;
public:
// 1 = no inlining allowed
// 2 = min (minimal inlining)
// 5 = default
// 9 = max (very aggressive inlining)
//
// Don't use these directly, except for logging etc.
static uint32_t rawLevel() {
uint32_t level = JS::Prefs::wasm_inlining_level();
return std::clamp(level, MIN_LEVEL, MAX_LEVEL);
}
static bool rawDirectAllowed() { return JS::Prefs::wasm_direct_inlining(); }
static bool rawCallRefAllowed() {
return JS::Prefs::wasm_call_ref_inlining();
}
// For a call_ref site, returns the percentage of the site's total calls that
// any single target must account for in order to be considered a candidate
// for speculative inlining.
static uint32_t rawCallRefPercent() {
uint32_t percent = JS::Prefs::wasm_call_ref_inlining_percent();
// Clamp to range 10 .. 100 (%).
return std::clamp(percent, 10u, 100u);
}
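// Illustrative example only (40 is an arbitrary value in the allowed
// 10 .. 100 range, not the actual pref default): if rawCallRefPercent()
// returns 40 and a call_ref site has made 1000 calls in total, then a single
// target must account for at least 400 of those calls to be considered for
// speculative inlining.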
// Given a call of kind `callKind` to a function of bytecode size
// `bodyLength` at `inliningDepth`, decide whether it is allowable to
// inline the call. Note that `inliningDepth` starts at zero, not one. In
// other words, a value of zero means the query relates to a function which
// (if approved) would be inlined into the top-level function currently being
// compiled.
enum class CallKind { Direct, CallRef };
static bool isSmallEnoughToInline(CallKind callKind, uint32_t inliningDepth,
uint32_t bodyLength) {
// If this fails, something's seriously wrong; bail out.
MOZ_RELEASE_ASSERT(inliningDepth <= 10); // because 10 > (400 / 50)
// Check whether calls of this kind are currently allowed
if ((callKind == CallKind::Direct && !rawDirectAllowed()) ||
(callKind == CallKind::CallRef && !rawCallRefAllowed())) {
return false;
}
// Check the size is allowable. This depends on how deep we are in the
// stack and on the setting of level_. We allow inlining of functions of
// size up to the `baseSize[]` value at depth zero, but reduce the
// allowable size by 50 for each further level of inlining, so that only
// smaller and smaller functions are allowed as we inline deeper.
//
// At some point `allowedSize` goes negative and thereby disallows all
// further inlining. Note that the `baseSize` entry for
// `level_ == MIN_LEVEL (== 1)` is set so as to disallow inlining even at
// depth zero. Hence `level_ == MIN_LEVEL` disallows all inlining.
static constexpr int32_t baseSize[9] = {0, 50, 100, 150,
200, // default
250, 300, 350, 400};
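// Illustrative worked example: at the default level (5), the starting
// allowance is baseSize[5 - 1] == 200, so a callee of up to 200 bytecode
// bytes may be inlined at depth zero, up to 100 bytes at depth 2, and
// nothing at depth 4 or beyond, since allowedSize is then <= 0.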
uint32_t level = rawLevel();
MOZ_RELEASE_ASSERT(level >= MIN_LEVEL && level <= MAX_LEVEL);
int32_t allowedSize = baseSize[level - MIN_LEVEL];
allowedSize -= int32_t(50 * inliningDepth);
return allowedSize > 0 && bodyLength <= uint32_t(allowedSize);
}
};
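// As an illustrative sketch only (the real call sites live elsewhere in the
// wasm compiler, e.g. the baseline and Ion function compilers; `bodyLength`
// and `calleeBodyLength` are hypothetical locals), the classes above might
// be consulted like this:
//
//   int32_t tierUpThreshold =
//       LazyTieringHeuristics::estimateIonCompilationCost(bodyLength);
//   bool mayInline = InliningHeuristics::isSmallEnoughToInline(
//       InliningHeuristics::CallKind::Direct, /*inliningDepth=*/0,
//       calleeBodyLength);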
// [SMDOC] Per-function and per-module inlining limits
// `class InliningHeuristics` makes inlining decisions on a per-call-site
// basis. Even with that in place, it is still possible to create a small
// input function for which inlining produces a huge (1000 x) expansion. Hence
// we also need a backstop mechanism to limit growth of functions and of
// modules as a whole.
//
// The following scheme is therefore implemented:
//
// * no function can have an inlining-based expansion of more than a constant
// factor (eg, 9 x).
//
// * for a module as a whole there is also a max expansion factor, and this is
// much lower, perhaps 1 x.
//
// This means that
//
// * no individual function can cause too much trouble (due to the 9 x limit),
// yet any function that needs a lot of inlining can still get it. In
// practice most functions have an inlining expansion, at default settings,
// of much less than 5 x.
//
// * the module as a whole cannot chew up excessive resources.
//
// Once a limit is exhausted, Ion compilation is still possible, but no
// inlining will be done.
//
// The per-module limit needs to be interpreted in the light of lazy tiering.
// Many modules only tier up a small subset of their functions. Hence the
// relatively low per-module limit still allows a high level of expansion of
// the functions that do get tiered up.
//
// In effect, the tiering mechanism gives hot functions (early tierer-uppers)
// preferential access to the module-level inlining budget. Colder functions
// that tier up later may find the budget to be exhausted, in which case they
// get no inlining. It would be feasible to gradually reduce inlining
// aggressiveness as the budget is used up, rather than have cliff-edge
// behaviour, but it hardly seems worth the hassle.
//
// To implement this, we have
//
// * `int64_t WasmCodeMetadata::ProtectedOptimizationStats::inliningBudget`:
// this is initially set as the maximum copied-in bytecode length allowable
// for the module. Inlining of individual call sites decreases the value and
// may drive it negative. Once the value is negative, no more inlining is
// allowed.
//
// * `int64_t FunctionCompiler::inliningBudget_` does the same at a
// per-function level. Its initial value takes into account the current
// value of the module-level budget; hence if the latter is exhausted, the
// function-level budget will be zero and so no inlining occurs.
//
// If either limit is exceeded, a message is printed on the
// `MOZ_LOG=wasmCodeMetaStats:3` channel.
// Allowing budgets to be driven negative means we slightly overshoot them. An
// alternative would be to ensure they can never be driven negative, in which
// case we would slightly undershoot them instead, given that the sum of inlined
// function sizes is unlikely to exactly match the budget. We use the
// overshoot scheme only because it makes it simple to decide when to log a
// budget-overshoot message and not emit any duplicates.
// There is a (logical, not-TSan-detectable) race condition in that the
// inlining budget for a function is set in part from the module-level budget
// at the time that compilation of the function begins, and the module-level
// budget is updated when compilation of a function ends -- see
// FunctionCompiler::initToplevel and ::finish. If there are multiple
// compilation threads, it can happen that multiple threads individually
// overrun the module-level budget, and so collectively overshoot the budget
// multiple times.
//
// The worst-case total overshoot is equal to the worst-case per-function
// overshoot multiplied by the max number of functions that can be concurrently
// compiled:
//
// <max per-function overshoot, which
// == the largest body length that can be accepted
// by InliningHeuristics::isSmallEnoughToInline>
// * MaxPartialTier2CompileTasks
//
// which with current settings is 400 * 1 == 400.
//
// We never expect to hit either limit in normal operation -- they exist only
// to protect against the worst case. So the imprecision doesn't matter.
// Setting the multiplier here to 1 means that inlining can copy in at maximum
// the same amount of bytecode as is in the module; 2 means twice as much, etc,
// and setting it to 0 would completely disable inlining.
static constexpr int64_t PerModuleMaxInliningRatio = 1;
// Same meaning as above, except at a per-function level.
static constexpr int64_t PerFunctionMaxInliningRatio = 9;
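// As an illustrative sketch only (an assumption for exposition, not the
// actual initialization, which lives with
// WasmCodeMetadata::ProtectedOptimizationStats and FunctionCompiler;
// `moduleBytecodeSize`, `bodyLength` and `remainingModuleBudget` are
// hypothetical names), the ratios above could feed into the budgets along
// these lines:
//
//   int64_t moduleBudget = PerModuleMaxInliningRatio * moduleBytecodeSize;
//   int64_t functionBudget =
//       std::min(PerFunctionMaxInliningRatio * int64_t(bodyLength),
//                std::max<int64_t>(remainingModuleBudget, 0));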
} // namespace wasm
} // namespace js
#endif /* wasm_WasmHeuristics_h */