browsertime_benchmark.py

mozilla-central/testing/raptor/browsertime/support-scripts/browsertime_benchmark.py (file symbol)

Enable keyboard shortcuts

Source code

File a bug in Testing :: Raptor

Revision control

Copy as Markdown

Other Tools

# This Source Code Form is subject to the terms of the Mozilla Public

# License, v. 2.0. If a copy of the MPL was not distributed with this

# file, You can obtain one at http://mozilla.org/MPL/2.0/.

import json

import pathlib

import sys

from collections.abc import Iterable

import filters

sys.path.insert(0, str(pathlib.Path(__file__).parent))

from browsertime_pageload import PageloadSupport

from logger.logger import RaptorLogger

LOG = RaptorLogger(component="perftest-support-class")

METRIC_BLOCKLIST = [

    "mean",

    "median",

    "geomean",

class MissingBenchmarkResultsError(Exception):

"""

    This error is raised when the benchmark results from a test

    run do not contain the `browsertime_benchmark` entry in the dict

    of extra data.

"""

    pass

class BenchmarkSupport(PageloadSupport):

    def __init__(self, **kwargs):

        super().__init__(**kwargs)

        self.failed_tests = []

        self.youtube_playback_failure = False

    def setup_test(self, next_test, args):

        super().setup_test(next_test, args)

        if next_test.get("custom_data", False) == "true":

            raise ValueError(

                "Cannot use BenchmarkSupport class for custom data, a "

                "new support class should be built for that use case."

    def modify_command(self, cmd, test):

        # Enable cpuTime, and wallclock-tracking metrics

        cmd.extend(

                "--browsertime.cpuTime_test",

                "true",

                "--browsertime.wallclock_tracking_test",

                "true",

    def handle_result(self, bt_result, raw_result, **kwargs):

        """Parse a result for the required results.

        See base_python_support.py for what's expected from this method.

"""

        # Each entry here is a separate cold pageload iteration (or browser cycle)

        for custom_types in raw_result["extras"]:

            browsertime_benchmark_results = custom_types.get("browsertime_benchmark")

            if not browsertime_benchmark_results:

                raise MissingBenchmarkResultsError(

                    "Could not find `browsertime_benchmark` entry "

                    "in the browsertime `extra` results"

            for metric, values in browsertime_benchmark_results.items():

                bt_result["measurements"].setdefault(metric, []).append(values)

        if self.perfstats:

            for cycle in raw_result["geckoPerfStats"]:

                for metric in cycle:

                    bt_result["measurements"].setdefault(

                        "perfstat-" + metric, []

                    ).append(cycle[metric])

    def parseYoutubePlaybackPerformanceOutput(self, test):

        """Parse the metrics for the Youtube playback performance test.

        For each video measured values for dropped and decoded frames will be

        available from the benchmark site.

        {u'PlaybackPerf.VP9.2160p60@2X': {u'droppedFrames': 1, u'decodedFrames': 796}

        With each page cycle / iteration of the test multiple values can be present.

        Raptor will calculate the percentage of dropped frames to decoded frames.

        All those three values will then be emitted as separate sub tests.

"""

        _subtests = {}

        test_name = [

            measurement

            for measurement in test["measurements"].keys()

            if "youtube-playback" in measurement

        if len(test_name) > 0:

            data = test["measurements"].get(test_name[0])

        else:

            raise Exception("No measurements found for youtube test!")

        def create_subtest_entry(

            name,

            value,

            unit=test["subtest_unit"],

            lower_is_better=test["subtest_lower_is_better"],

):

            # build a list of subtests and append all related replicates

            if name not in _subtests:

                # subtest not added yet, first pagecycle, so add new one

                _subtests[name] = {

                    "name": name,

                    "unit": unit,

                    "lowerIsBetter": lower_is_better,

                    "replicates": [],

            _subtests[name]["replicates"].append(value)

            if self.subtest_alert_on is not None:

                if name in self.subtest_alert_on:

                    LOG.info(

                        "turning on subtest alerting for measurement type: %s" % name

                    _subtests[name]["shouldAlert"] = True

        for pagecycle in data:

            for _sub, _value in pagecycle[0].items():

                if _value["decodedFrames"] == 0:

                    self.failed_tests.append(

                        "%s test Failed. decodedFrames %s droppedFrames %s."

                        % (_sub, _value["decodedFrames"], _value["droppedFrames"])

                try:

                    percent_dropped = (

                        float(_value["droppedFrames"]) / _value["decodedFrames"] * 100.0

                except ZeroDivisionError:

                    # if no frames have been decoded the playback failed completely

                    percent_dropped = 100.0

                # Remove the not needed "PlaybackPerf." prefix from each test

                _sub = _sub.split("PlaybackPerf", 1)[-1]

                if _sub.startswith("."):

                    _sub = _sub[1:]

                # build a list of subtests and append all related replicates

                create_subtest_entry(

                    f"{_sub}_decoded_frames",

                    _value["decodedFrames"],

                    lower_is_better=False,

                create_subtest_entry(f"{_sub}_dropped_frames", _value["droppedFrames"])

                create_subtest_entry(f"{_sub}_%_dropped_frames", percent_dropped)

        # Check if any youtube test failed and generate exception

        if len(self.failed_tests) > 0:

            self.youtube_playback_failure = True

        vals = []

        subtests = []

        names = list(_subtests)

        names.sort(reverse=True)

        for name in names:

            # pylint: disable=W1633

            _subtests[name]["value"] = round(

                float(filters.median(_subtests[name]["replicates"])), 2

            subtests.append(_subtests[name])

            # only include dropped_frames values, without the %_dropped_frames values

            if name.endswith("X_dropped_frames"):

                vals.append([_subtests[name]["value"], name])

        return subtests, vals

    def parseWebCodecsOutput(self, test):

"""

        Example output (this is one page cycle):

            'name': 'webcodecs',

            'type': 'benchmark',

            'measurements': {

            'webcodecs': [

['{

                    "vp8 realtime encode": {

                        "frame-to-frame mean (key)": {"value":5.222857,"unit":"ms"},

                        "frame-to-frame cv (key)":{"value":27.052957,"unit":"%"},

                        "frame-dropping rate (key)":{"value":0,"unit":"%"},

                        "frame-to-frame mean (non key)":{"value":1.460678,"unit":"ms"},

                        "frame-to-frame cv (non key)":{"value":65.4360136,"unit":"%"},

                        "frame-dropping rate (non key)":{"value":0,"unit":"%"}

                }'],

...

},

            'lower_is_better': False,

            'unit': 'score'

"""

        data = test["measurements"]["webcodecs"]

        results = {}

        for page_cycle in data:

            d = json.loads(page_cycle[0])

            for test_name, test_data in d.items():

                results.setdefault(test_name, []).append(test_data)

        _subtests = {}

        for test_name in results:

            for result in results[test_name]:

                for subtest_name, subtest_result in result.items():

                    subtest_result_name = f"{test_name} - {subtest_name}"

                    _subtests.setdefault(

                        subtest_result_name,

                            "unit": subtest_result["unit"],

                            "alertThreshold": float(test["alert_threshold"]),

                            "lowerIsBetter": test["subtest_lower_is_better"],

                            "name": subtest_result_name,

                            "replicates": [],

                            "shouldAlert": True,

},

                    )["replicates"].append(subtest_result["value"])

            for subtest_name in results[test_name]:

                for subtest_name in result:

                    subtest_result_name = f"{test_name} - {subtest_name}"

                    _subtests[subtest_result_name]["value"] = filters.median(

                        _subtests[subtest_result_name]["replicates"]

        subtests = sorted(_subtests.values(), key=lambda x: x["name"], reverse=True)

        for subtest in subtests:

            if isinstance(subtest["value"], float):

                subtest["value"] = round(subtest["value"], 3)

        vals = [[subtest["value"], subtest["name"]] for subtest in subtests]

        return subtests, vals

    def parseUnknown(self, test):

        # Attempt to flatten whatever we've been given

        # Dictionary keys will be joined by dashes, arrays represent

        # represent "iterations"

        _subtests = {}

        if not isinstance(test["measurements"], dict):

            raise Exception(

                "Expected a dictionary with a single entry as the name of the test. "

                "The value of this key should be the data."

        for iteration in test["measurements"][list(test["measurements"].keys())[0]]:

            flattened_metrics = None

            for metric, value in (flattened_metrics or iteration).items():

                if metric in METRIC_BLOCKLIST:

                    # TODO: Add an option in the test manifest for this

                    continue

                if metric not in _subtests:

                    # subtest not added yet, first pagecycle, so add new one

                    _subtests[metric] = {

                        "unit": test["subtest_unit"],

                        "alertThreshold": float(test["alert_threshold"]),

                        "lowerIsBetter": test["subtest_lower_is_better"],

                        "name": metric,

                        "replicates": [],

                updated_metric = value

                if not isinstance(value, Iterable):

                    updated_metric = [value]

                # pylint: disable=W1633

                _subtests[metric]["replicates"].extend(

                    [round(x, 3) for x in updated_metric]

        vals = []

        subtests = []

        names = list(_subtests)

        names.sort(reverse=True)

        summaries = {

            "median": filters.median,

            "mean": filters.mean,

            "geomean": filters.geometric_mean,

        for name in names:

            summary_method = test.get("submetric_summary_method", "median")

            _subtests[name]["value"] = round(

                summaries[summary_method](_subtests[name]["replicates"]), 3

            subtests.append(_subtests[name])

            vals.append([_subtests[name]["value"], name])

        return subtests, vals

    def construct_summary(self, vals, testname, unit=None):

        def _filter(vals, value=None):

            if value is None:

                return [i for i, j in vals]

            return [i for i, j in vals if j == value]

        if testname.startswith("raptor-v8_7"):

            return 100 * filters.geometric_mean(_filter(vals))

        if testname == "speedometer3":

            score = None

            for val, name in vals:

                if name == "score":

                    score = val

            if score is None:

                raise Exception("Unable to find score for Speedometer 3")

            return score

        if "speedometer" in testname:

            correctionFactor = 3

            results = _filter(vals)

            # speedometer has 16 tests, each of these are made of up 9 subtests

            # and a sum of the 9 values.  We receive 160 values, and want to use

            # the 16 test values, not the sub test values.

            if len(results) != 160:

                raise Exception(

                    "Speedometer has 160 subtests, found: %s instead" % len(results)

            results = results[9::10]

            # pylint --py3k W1619

            score = 60 * 1000 / filters.geometric_mean(results) / correctionFactor

            return score

        if "stylebench" in testname:

            # see https://bug-172968-attachments.webkit.org/attachment.cgi?id=319888

            correctionFactor = 3

            results = _filter(vals)

            # stylebench has 6 tests. Five of them are made of up 5 subtests

            #   * Adding classes.

            #   * Removing classes.

            #   * Mutating attributes.

            #   * Adding leaf elements.

            #   * Removing leaf elements.

            # which are made of two subtests each (sync/async) and repeated 5 times

            # each, thus, the list here looks like:

            #   [Test name/Adding classes - 0/ Sync; <x>]

            #   [Test name/Adding classes - 0/ Async; <y>]

            #   [Test name/Adding classes - 0; <x> + <y>]

            #   [Test name/Removing classes - 0/ Sync; <x>]

            #   [Test name/Removing classes - 0/ Async; <y>]

            #   [Test name/Removing classes - 0; <x> + <y>]

            #   ...

            #   [Test name/Adding classes - 1 / Sync; <x>]

            #   [Test name/Adding classes - 1 / Async; <y>]

            #   [Test name/Adding classes - 1 ; <x> + <y>]

            #   ...

            #   [Test name/Removing leaf elements - 4; <x> + <y>]

            #   [Test name; <sum>] <- This is what we want.

            # So, 5 (subtests) *

            #     5 (repetitions) *

            #     3 (entries per repetition (sync/async/sum)) =

            #     75 entries for test before the sum.

            # We receive 76 entries per test, which ads up to 380. We want to use

            # the 5 test entries, not the rest.

            # Then there's the sixth "Dynamic media queries" test, which gives

            # results for viewports in increments of 50px like:

            #   Dynamic media queries/Resizing to 300px - 0/Sync

            #   Dynamic media queries/Resizing to 300px - 0/Async

            #   Dynamic media queries/Resizing to 300px - 0

            #   Dynamic media queries/Resizing to 350px - 0/Sync

            #   Dynamic media queries/Resizing to 350px - 0/Async

            #   Dynamic media queries/Resizing to 350px - 0

            #   ...

            #   Dynamic media queries/Resizing to 800px - 0/Sync

            #   Dynamic media queries/Resizing to 800px - 0/Async

            #   Dynamic media queries/Resizing to 800px - 0

            #   Dynamic media queries/Resizing to 350px - 1/Sync

            #   Dynamic media queries/Resizing to 350px - 1/Async

            #   Dynamic media queries/Resizing to 350px - 1

            #   Dynamic media queries/Resizing to 400px - 1/Sync

            #   Dynamic media queries/Resizing to 400px - 1/Async

            #   Dynamic media queries/Resizing to 400px - 1

            #   ...

            #   Dynamic media queries/Resizing to 800px - 4/Sync

            #   Dynamic media queries/Resizing to 800px - 4/Async

            #   Dynamic media queries/Resizing to 800px - 4

            #   Dynamic media queries <- What we want

            # So len([300, 350, 400, 450, 500, 550, 600, 650, 700, 750, 800]) is 11.

            # So, 11 (subtests) *

            #     5 (repetitions) *

            #     3 (entries per repetition (sync/async/sum)) =

            #     165 entries for test before the sum.

            EXPECTED_ENTRIES = 380 + 166

            if len(results) != EXPECTED_ENTRIES:

                raise Exception(

                    f"StyleBench requires {EXPECTED_ENTRIES} entries, found: {len(results)} instead"

            results = results[:380][75::76] + [results[-1]]

            # pylint --py3k W1619

            return 60 * 1000 / filters.geometric_mean(results) / correctionFactor

        if testname.startswith("raptor-kraken") or "sunspider" in testname:

            return sum(_filter(vals))

        if "unity-webgl" in testname or "webaudio" in testname:

            # webaudio_score and unity_webgl_score: self reported as 'Geometric Mean'

            return filters.mean(_filter(vals, "Geometric Mean"))

        if "assorted-dom" in testname:

            # pylint: disable=W1633

            return round(filters.geometric_mean(_filter(vals)), 2)

        if "wasm-misc" in testname:

            # wasm_misc_score: self reported as '__total__'

            return filters.mean(_filter(vals, "__total__"))

        if "wasm-godot" in testname:

            # wasm_godot_score: first-interactive mean

            return filters.mean(_filter(vals, "first-interactive"))

        if "youtube-playback" in testname:

            # pylint: disable=W1633

            return round(filters.mean(_filter(vals)), 2)

        if "twitch-animation" in testname:

            return round(filters.geometric_mean(_filter(vals, "run")), 2)

        if "ve" in testname:

            if "rt" in testname:

                # We collect the mean and cv of frame-to-frame performance and the

                # frame-dropping rate for both keyframe and non-keyframe. However,

                # the most important factor is the frame-to-frame mean, so we only

                # include it in the summarized score. Note that all the values

                # collected are monitored by "shouldAlert".

                means = [i for i, j in vals if "mean" in j]

                if len(means) > 0:

                    return round(filters.geometric_mean(means), 2)

                return -1

            if "q" in testname:

                if len(vals) > 0:

                    return round(filters.mean(_filter(vals)), 2)

                return -1

            raise NotImplementedError("Summary for %s is not implemented" % testname)

        if testname.startswith("supporting_data"):

            if not unit:

                return sum(_filter(vals))

            if unit == "%":

                return filters.mean(_filter(vals))

            if unit in ("W", "MHz"):

                # For power in Watts and clock frequencies,

                # summarize with the sum of the averages

                allavgs = []

                for val, subtest in vals:

                    if "avg" in subtest:

                        allavgs.append(val)

                if allavgs:

                    return sum(allavgs)

                raise Exception(

                    "No average measurements found for supporting data with W, or MHz unit ."

            if unit in ["KB", "mAh", "mWh"]:

                return sum(_filter(vals))

            raise NotImplementedError("Unit %s not suported" % unit)

        if len(vals) > 1:

            # pylint: disable=W1633

            return round(filters.geometric_mean(_filter(vals)), 2)

        # pylint: disable=W1633

        return round(filters.mean(_filter(vals)), 2)

    def _process_measurements(self, suite, test, measurement_name, replicates):

        subtest = {}

        subtest["name"] = measurement_name

        subtest["lowerIsBetter"] = test["subtest_lower_is_better"]

        subtest["alertThreshold"] = float(test["alert_threshold"])

        unit = test["subtest_unit"]

        if measurement_name == "cpuTime":

            unit = "ms"

        elif measurement_name == "powerUsage":

            unit = "uWh"

        subtest["unit"] = unit

        # Add the alert window settings if needed here too in case

        # there is no summary value in the test

        for schema_name in (

            "minBackWindow",

            "maxBackWindow",

            "foreWindow",

):

            if suite.get(schema_name, None) is not None:

                subtest[schema_name] = suite[schema_name]

        # if 'alert_on' is set for this particular measurement, then we want to set

        # the flag in the perfherder output to turn on alerting for this subtest

        if self.subtest_alert_on is not None:

            if measurement_name in self.subtest_alert_on:

                LOG.info(

                    "turning on subtest alerting for measurement type: %s"

                    % measurement_name

                subtest["shouldAlert"] = True

                if self.app in (

                    "chrome",

                    "chrome-m",

                    "custom-car",

                    "cstm-car-m",

):

                    subtest["shouldAlert"] = False

            else:

                # Explicitly set `shouldAlert` to False so that the measurement

                # is not alerted on. Otherwise Perfherder defaults to alerting.

                LOG.info(

                    "turning off subtest alerting for measurement type: %s"

                    % measurement_name

                subtest["shouldAlert"] = False

        if self.power_test and measurement_name == "powerUsage":

            subtest["shouldAlert"] = True

        subtest["replicates"] = replicates

        return subtest

    def summarize_test(self, test, suite, **kwargs):

        subtests = None

        if "youtube-playback" in test["name"]:

            subtests, vals = self.parseYoutubePlaybackPerformanceOutput(test)

        elif "ve" in test["name"]:

            subtests, vals = self.parseWebCodecsOutput(test)

        else:

            # Attempt to parse the unknown benchmark by flattening the

            # given data and merging all the arrays of non-iterable

            # data that fall under the same key.

            # XXX Note that this is not fully implemented for the summary

            # of the metric or test as we don't have a use case for that yet.

            subtests, vals = self.parseUnknown(test)

        if subtests is None:

            raise Exception("No benchmark metrics found in browsertime results")

        suite["subtests"] = subtests

        self.add_additional_metrics(test, suite)

        # summarize results for both benchmark type tests

        if len(subtests) > 1:

            suite["value"] = self.construct_summary(vals, testname=test["name"])

        subtests.sort(key=lambda subtest: subtest["name"])

    def summarize_suites(self, suites):

        pass

    def report_test_success(self):

        if len(self.failed_tests) > 0:

            LOG.warning("Some tests failed.")

            if self.youtube_playback_failure:

                for test in self.failed_tests:

                    LOG.warning("Youtube sub-test FAILED: %s" % test)

                LOG.warning(

                    "Youtube playback sub-tests failed!!! "

                    "Not submitting results to perfherder!"

            return False

        return True