# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
import pathlib
import sys
from collections.abc import Iterable
import filters
sys.path.insert(0, str(pathlib.Path(__file__).parent))
from browsertime_pageload import PageloadSupport
from logger.logger import RaptorLogger
LOG = RaptorLogger(component="perftest-support-class")
METRIC_BLOCKLIST = [
"mean",
"median",
"geomean",
]
class MissingBenchmarkResultsError(Exception):
"""
This error is raised when the benchmark results from a test
run do not contain the `browsertime_benchmark` entry in the dict
of extra data.
"""
pass
class BenchmarkSupport(PageloadSupport):
def __init__(self, **kwargs):
super().__init__(**kwargs)
self.failed_tests = []
self.youtube_playback_failure = False
def setup_test(self, next_test, args):
super().setup_test(next_test, args)
if next_test.get("custom_data", False) == "true":
raise ValueError(
"Cannot use BenchmarkSupport class for custom data, a "
"new support class should be built for that use case."
)
def modify_command(self, cmd, test):
        # Enable the cpuTime and wallclock-tracking metrics
cmd.extend(
[
"--browsertime.cpuTime_test",
"true",
"--browsertime.wallclock_tracking_test",
"true",
]
)
def handle_result(self, bt_result, raw_result, **kwargs):
"""Parse a result for the required results.
See base_python_support.py for what's expected from this method.
"""
# Each entry here is a separate cold pageload iteration (or browser cycle)
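        # Illustrative sketch of the assumed data shape (hypothetical values,
        # not from a real run): a raw_result["extras"] entry may look like
        #     {"browsertime_benchmark": {"score": [100.2], "subtest-a": [12.3]}}
        # so bt_result["measurements"] accumulates one list per metric across
        # iterations, e.g. {"score": [[100.2]], "subtest-a": [[12.3]]}.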
for custom_types in raw_result["extras"]:
browsertime_benchmark_results = custom_types.get("browsertime_benchmark")
if not browsertime_benchmark_results:
raise MissingBenchmarkResultsError(
"Could not find `browsertime_benchmark` entry "
"in the browsertime `extra` results"
)
for metric, values in browsertime_benchmark_results.items():
bt_result["measurements"].setdefault(metric, []).append(values)
if self.perfstats:
for cycle in raw_result["geckoPerfStats"]:
for metric in cycle:
bt_result["measurements"].setdefault(
"perfstat-" + metric, []
).append(cycle[metric])
def parseYoutubePlaybackPerformanceOutput(self, test):
"""Parse the metrics for the Youtube playback performance test.
For each video measured values for dropped and decoded frames will be
available from the benchmark site.
{u'PlaybackPerf.VP9.2160p60@2X': {u'droppedFrames': 1, u'decodedFrames': 796}
With each page cycle / iteration of the test multiple values can be present.
Raptor will calculate the percentage of dropped frames to decoded frames.
All those three values will then be emitted as separate sub tests.
"""
_subtests = {}
test_name = [
measurement
for measurement in test["measurements"].keys()
if "youtube-playback" in measurement
]
if len(test_name) > 0:
data = test["measurements"].get(test_name[0])
else:
raise Exception("No measurements found for youtube test!")
def create_subtest_entry(
name,
value,
unit=test["subtest_unit"],
lower_is_better=test["subtest_lower_is_better"],
):
# build a list of subtests and append all related replicates
if name not in _subtests:
# subtest not added yet, first pagecycle, so add new one
_subtests[name] = {
"name": name,
"unit": unit,
"lowerIsBetter": lower_is_better,
"replicates": [],
}
_subtests[name]["replicates"].append(value)
if self.subtest_alert_on is not None:
if name in self.subtest_alert_on:
LOG.info(
"turning on subtest alerting for measurement type: %s" % name
)
_subtests[name]["shouldAlert"] = True
for pagecycle in data:
for _sub, _value in pagecycle[0].items():
if _value["decodedFrames"] == 0:
self.failed_tests.append(
"%s test Failed. decodedFrames %s droppedFrames %s."
% (_sub, _value["decodedFrames"], _value["droppedFrames"])
)
try:
percent_dropped = (
float(_value["droppedFrames"]) / _value["decodedFrames"] * 100.0
)
except ZeroDivisionError:
# if no frames have been decoded the playback failed completely
percent_dropped = 100.0
                # Remove the unneeded "PlaybackPerf." prefix from each test
_sub = _sub.split("PlaybackPerf", 1)[-1]
if _sub.startswith("."):
_sub = _sub[1:]
# build a list of subtests and append all related replicates
create_subtest_entry(
"{}_decoded_frames".format(_sub),
_value["decodedFrames"],
lower_is_better=False,
)
create_subtest_entry(
"{}_dropped_frames".format(_sub), _value["droppedFrames"]
)
create_subtest_entry(
"{}_%_dropped_frames".format(_sub), percent_dropped
)
        # Check if any youtube test failed and flag the failure for reporting
if len(self.failed_tests) > 0:
self.youtube_playback_failure = True
vals = []
subtests = []
names = list(_subtests)
names.sort(reverse=True)
for name in names:
# pylint: disable=W1633
_subtests[name]["value"] = round(
float(filters.median(_subtests[name]["replicates"])), 2
)
subtests.append(_subtests[name])
# only include dropped_frames values, without the %_dropped_frames values
if name.endswith("X_dropped_frames"):
vals.append([_subtests[name]["value"], name])
return subtests, vals
def parseUnknown(self, test):
        # Attempt to flatten whatever we've been given. Dictionary keys
        # will be joined by dashes, and arrays represent "iterations".
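        # Illustrative example (hypothetical data): measurements of
        #     {"my-benchmark": [{"fcp": [100, 110]}, {"fcp": [105]}]}
        # produce a single subtest "fcp" with replicates [100, 110, 105] and,
        # with the default "median" summary method, a value of 105.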
_subtests = {}
if not isinstance(test["measurements"], dict):
raise Exception(
"Expected a dictionary with a single entry as the name of the test. "
"The value of this key should be the data."
)
for iteration in test["measurements"][list(test["measurements"].keys())[0]]:
flattened_metrics = None
for metric, value in (flattened_metrics or iteration).items():
if metric in METRIC_BLOCKLIST:
# TODO: Add an option in the test manifest for this
continue
if metric not in _subtests:
# subtest not added yet, first pagecycle, so add new one
_subtests[metric] = {
"unit": test["subtest_unit"],
"alertThreshold": float(test["alert_threshold"]),
"lowerIsBetter": test["subtest_lower_is_better"],
"name": metric,
"replicates": [],
}
updated_metric = value
if not isinstance(value, Iterable):
updated_metric = [value]
# pylint: disable=W1633
_subtests[metric]["replicates"].extend(
[round(x, 3) for x in updated_metric]
)
vals = []
subtests = []
names = list(_subtests)
names.sort(reverse=True)
summaries = {
"median": filters.median,
"mean": filters.mean,
"geomean": filters.geometric_mean,
}
for name in names:
summary_method = test.get("submetric_summary_method", "median")
_subtests[name]["value"] = round(
summaries[summary_method](_subtests[name]["replicates"]), 3
)
subtests.append(_subtests[name])
vals.append([_subtests[name]["value"], name])
return subtests, vals
def construct_summary(self, vals, testname, unit=None):
def _filter(vals, value=None):
if value is None:
return [i for i, j in vals]
return [i for i, j in vals if j == value]
if testname.startswith("raptor-v8_7"):
return 100 * filters.geometric_mean(_filter(vals))
if testname == "speedometer3":
score = None
for val, name in vals:
if name == "score":
score = val
if score is None:
raise Exception("Unable to find score for Speedometer 3")
return score
if "speedometer" in testname:
correctionFactor = 3
results = _filter(vals)
            # Speedometer has 16 tests, each of which is made up of 9 subtests
            # and a sum of those 9 values. We receive 160 values, and want to
            # use the 16 per-test sums, not the subtest values.
if len(results) != 160:
raise Exception(
"Speedometer has 160 subtests, found: %s instead" % len(results)
)
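            # The slice below keeps every 10th entry starting at index 9,
            # i.e. the per-test sums described above. Worked example with
            # hypothetical numbers: a geometric mean of 200 ms across those
            # 16 sums gives a score of 60 * 1000 / 200 / 3 = 100.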
results = results[9::10]
# pylint --py3k W1619
score = 60 * 1000 / filters.geometric_mean(results) / correctionFactor
return score
if "stylebench" in testname:
correctionFactor = 3
results = _filter(vals)
            # StyleBench has 5 tests, each of which is made up of 5 subtests
#
# * Adding classes.
# * Removing classes.
# * Mutating attributes.
# * Adding leaf elements.
# * Removing leaf elements.
#
# which are made of two subtests each (sync/async) and repeated 5 times
# each, thus, the list here looks like:
#
# [Test name/Adding classes - 0/ Sync; <x>]
# [Test name/Adding classes - 0/ Async; <y>]
# [Test name/Adding classes - 0; <x> + <y>]
# [Test name/Removing classes - 0/ Sync; <x>]
# [Test name/Removing classes - 0/ Async; <y>]
# [Test name/Removing classes - 0; <x> + <y>]
# ...
# [Test name/Adding classes - 1 / Sync; <x>]
# [Test name/Adding classes - 1 / Async; <y>]
# [Test name/Adding classes - 1 ; <x> + <y>]
# ...
# [Test name/Removing leaf elements - 4; <x> + <y>]
# [Test name; <sum>] <- This is what we want.
#
# So, 5 (subtests) *
# 5 (repetitions) *
# 3 (entries per repetition (sync/async/sum)) =
            # 75 entries per test before the sum.
#
            # We receive 76 entries per test, which adds up to 380. We want to use
# the 5 test entries, not the rest.
if len(results) != 380:
raise Exception(
"StyleBench requires 380 entries, found: %s instead" % len(results)
)
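            # The slice below keeps every 76th entry starting at index 75,
            # i.e. the per-test sums described above. Worked example with
            # hypothetical numbers: a geometric mean of 180 ms across those
            # 5 sums gives a score of 60 * 1000 / 180 / 3 ≈ 111.1.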
results = results[75::76]
# pylint --py3k W1619
return 60 * 1000 / filters.geometric_mean(results) / correctionFactor
if testname.startswith("raptor-kraken") or "sunspider" in testname:
return sum(_filter(vals))
if "unity-webgl" in testname or "webaudio" in testname:
# webaudio_score and unity_webgl_score: self reported as 'Geometric Mean'
return filters.mean(_filter(vals, "Geometric Mean"))
if "assorted-dom" in testname:
# pylint: disable=W1633
return round(filters.geometric_mean(_filter(vals)), 2)
if "wasm-misc" in testname:
# wasm_misc_score: self reported as '__total__'
return filters.mean(_filter(vals, "__total__"))
if "wasm-godot" in testname:
# wasm_godot_score: first-interactive mean
return filters.mean(_filter(vals, "first-interactive"))
if "youtube-playback" in testname:
# pylint: disable=W1633
return round(filters.mean(_filter(vals)), 2)
if "twitch-animation" in testname:
return round(filters.geometric_mean(_filter(vals, "run")), 2)
if testname.startswith("supporting_data"):
if not unit:
return sum(_filter(vals))
if unit == "%":
return filters.mean(_filter(vals))
if unit in ("W", "MHz"):
# For power in Watts and clock frequencies,
# summarize with the sum of the averages
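                # Illustrative example (hypothetical values): vals of
                # [[5.0, "browser-power-avg"], [7.5, "gpu-power-avg"],
                #  [9.0, "browser-power-max"]] summarize to 5.0 + 7.5 = 12.5.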
allavgs = []
for val, subtest in vals:
if "avg" in subtest:
allavgs.append(val)
if allavgs:
return sum(allavgs)
raise Exception(
"No average measurements found for supporting data with W, or MHz unit ."
)
if unit in ["KB", "mAh", "mWh"]:
return sum(_filter(vals))
            raise NotImplementedError("Unit %s not supported" % unit)
if len(vals) > 1:
# pylint: disable=W1633
return round(filters.geometric_mean(_filter(vals)), 2)
# pylint: disable=W1633
return round(filters.mean(_filter(vals)), 2)
def _process_measurements(self, suite, test, measurement_name, replicates):
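        # Build a single Perfherder subtest entry for one measurement.
        # Illustrative output (hypothetical values): for measurement_name
        # "cpuTime" this might return
        #     {"name": "cpuTime", "unit": "ms", "lowerIsBetter": True,
        #      "alertThreshold": 2.0, "shouldAlert": False,
        #      "replicates": [123, 130]}
        # plus any alert window settings, depending on the suite/test
        # settings and the app under test.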
subtest = {}
subtest["name"] = measurement_name
subtest["lowerIsBetter"] = test["subtest_lower_is_better"]
subtest["alertThreshold"] = float(test["alert_threshold"])
unit = test["subtest_unit"]
if measurement_name == "cpuTime":
unit = "ms"
elif measurement_name == "powerUsage":
unit = "uWh"
subtest["unit"] = unit
        # Also add the alert window settings here, if needed, in case
        # the test has no summary value
for schema_name in (
"minBackWindow",
"maxBackWindow",
"foreWindow",
):
if suite.get(schema_name, None) is not None:
subtest[schema_name] = suite[schema_name]
# if 'alert_on' is set for this particular measurement, then we want to set
# the flag in the perfherder output to turn on alerting for this subtest
if self.subtest_alert_on is not None:
if measurement_name in self.subtest_alert_on:
LOG.info(
"turning on subtest alerting for measurement type: %s"
% measurement_name
)
subtest["shouldAlert"] = True
if self.app in (
"chrome",
"chrome-m",
"custom-car",
"cstm-car-m",
):
subtest["shouldAlert"] = False
else:
# Explicitly set `shouldAlert` to False so that the measurement
# is not alerted on. Otherwise Perfherder defaults to alerting.
LOG.info(
"turning off subtest alerting for measurement type: %s"
% measurement_name
)
subtest["shouldAlert"] = False
if self.power_test and measurement_name == "powerUsage":
subtest["shouldAlert"] = True
subtest["replicates"] = replicates
return subtest
def summarize_test(self, test, suite, **kwargs):
subtests = None
if "youtube-playback" in test["name"]:
subtests, vals = self.parseYoutubePlaybackPerformanceOutput(test)
else:
            # Attempt to parse the unknown benchmark by flattening the
            # given data and merging all non-iterable values that fall
            # under the same key into a single array of replicates.
# XXX Note that this is not fully implemented for the summary
# of the metric or test as we don't have a use case for that yet.
subtests, vals = self.parseUnknown(test)
if subtests is None:
raise Exception("No benchmark metrics found in browsertime results")
suite["subtests"] = subtests
self.add_additional_metrics(test, suite)
        # summarize results for both types of benchmark tests
if len(subtests) > 1:
suite["value"] = self.construct_summary(vals, testname=test["name"])
subtests.sort(key=lambda subtest: subtest["name"])
def summarize_suites(self, suites):
pass
def report_test_success(self):
if len(self.failed_tests) > 0:
LOG.warning("Some tests failed.")
if self.youtube_playback_failure:
for test in self.failed_tests:
LOG.warning("Youtube sub-test FAILED: %s" % test)
LOG.warning(
"Youtube playback sub-tests failed!!! "
"Not submitting results to perfherder!"
)
return False
return True