path: root/testing/raptor/raptor/output.py
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.

# some parts of this originally taken from /testing/talos/talos/output.py

"""output raptor test results"""
import copy
import json
import os
import warnings
from abc import ABCMeta, abstractmethod
from collections.abc import Iterable

import filters
import six
from logger.logger import RaptorLogger
from utils import flatten

LOG = RaptorLogger(component="perftest-output")

VISUAL_METRICS = [
    "SpeedIndex",
    "ContentfulSpeedIndex",
    "PerceptualSpeedIndex",
    "FirstVisualChange",
    "LastVisualChange",
    "VisualReadiness",
    "VisualComplete85",
    "VisualComplete95",
    "VisualComplete99",
]

METRIC_BLOCKLIST = [
    "mean",
    "median",
    "geomean",
]


@six.add_metaclass(ABCMeta)
class PerftestOutput(object):
    """Abstract base class to handle output of perftest results"""

    def __init__(
        self, results, supporting_data, subtest_alert_on, app, extra_summary_methods=[]
    ):
        """
        - results : list of RaptorTestResult instances
        """
        self.app = app
        self.results = results
        self.summarized_results = {}
        self.supporting_data = supporting_data
        self.summarized_supporting_data = []
        self.summarized_screenshots = []
        self.subtest_alert_on = subtest_alert_on
        self.browser_name = None
        self.browser_version = None
        self.extra_summary_methods = extra_summary_methods

    @abstractmethod
    def summarize(self, test_names):
        raise NotImplementedError()

    def set_browser_meta(self, browser_name, browser_version):
        # sets the browser metadata for the perfherder data
        self.browser_name = browser_name
        self.browser_version = browser_version

    def summarize_supporting_data(self):
        """
        Supporting data was gathered outside of the main raptor test; it will be kept
        separate from the main raptor test results. Summarize it appropriately.

        supporting_data = {
            'type': 'data-type',
            'test': 'raptor-test-ran-when-data-was-gathered',
            'unit': 'unit that the values are in',
            'summarize-values': True/False,
            'suite-suffix-type': True/False,
            'values': {
                'name': value_dict,
                'nameN': value_dictN
            }
        }

        More specifically, subtest supporting data will look like this:

        supporting_data = {
            'type': 'power',
            'test': 'raptor-speedometer-geckoview',
            'unit': 'mAh',
            'values': {
                'cpu': {
                    'values': val,
                    'lowerIsBetter': True/False,
                    'alertThreshold': 2.0,
                    'subtest-prefix-type': True/False,
                    'unit': 'mWh'
                },
                'wifi': ...
            }
        }

        We want to treat each value as a 'subtest', and for the overall aggregated
        test result the summary value depends on the unit. An exception is
        raised if we don't know about the specified unit.
        """
        if self.supporting_data is None:
            return

        self.summarized_supporting_data = []
        support_data_by_type = {}

        for data_set in self.supporting_data:

            data_type = data_set["type"]
            LOG.info("summarizing %s data" % data_type)

            if data_type not in support_data_by_type:
                support_data_by_type[data_type] = {
                    "framework": {"name": "raptor"},
                    "suites": [],
                }

            # the suite name will be the name of the actual raptor test that ran, plus the
            # type of supporting data, i.e. 'raptor-speedometer-geckoview-power'
            vals = []
            subtests = []

            suite_name = data_set["test"]
            if data_set.get("suite-suffix-type", True):
                suite_name = "%s-%s" % (data_set["test"], data_set["type"])

            suite = {
                "name": suite_name,
                "type": data_set["type"],
                "subtests": subtests,
            }
            if data_set.get("summarize-values", True):
                suite.update(
                    {
                        "lowerIsBetter": True,
                        "unit": data_set["unit"],
                        "alertThreshold": 2.0,
                    }
                )

            for result in self.results:
                if result["name"] == data_set["test"]:
                    suite["extraOptions"] = result["extra_options"]
                    break

            support_data_by_type[data_type]["suites"].append(suite)
            for measurement_name, value_info in data_set["values"].items():
                # Subtests are expected to be specified in a dictionary; this
                # provides backwards compatibility with the old method
                if not isinstance(value_info, dict):
                    value_info = {"values": value_info}

                new_subtest = {}
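                # e.g. a 'cpu' measurement in a 'power' data set is reported as the
                # subtest 'power-cpu' when subtest-prefix-type is set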
                if value_info.get("subtest-prefix-type", True):
                    new_subtest["name"] = data_type + "-" + measurement_name
                else:
                    new_subtest["name"] = measurement_name

                new_subtest["value"] = value_info["values"]
                new_subtest["lowerIsBetter"] = value_info.get("lowerIsBetter", True)
                new_subtest["alertThreshold"] = value_info.get("alertThreshold", 2.0)
                new_subtest["unit"] = value_info.get("unit", data_set["unit"])

                if "shouldAlert" in value_info:
                    new_subtest["shouldAlert"] = value_info.get("shouldAlert")

                subtests.append(new_subtest)
                vals.append([new_subtest["value"], new_subtest["name"]])

            if len(subtests) >= 1 and data_set.get("summarize-values", True):
                suite["value"] = self.construct_summary(
                    vals, testname="supporting_data", unit=data_set["unit"]
                )

        # split the supporting data by type, there will be one
        # perfherder output per type
        for data_type in support_data_by_type:
            data = support_data_by_type[data_type]
            if self.browser_name:
                data["application"] = {"name": self.browser_name}
                if self.browser_version:
                    data["application"]["version"] = self.browser_version
            self.summarized_supporting_data.append(data)

        return

    def output(self, test_names):
        """output to file and perfherder data json"""
        if os.getenv("MOZ_UPLOAD_DIR"):
            # i.e. testing/mozharness/build/raptor.json locally; in production it will
            # be at /tasks/task_*/build/ (where it will be picked up by mozharness later
            # and made into a tc artifact accessible in treeherder as perfherder-data.json)
            results_path = os.path.join(
                os.path.dirname(os.environ["MOZ_UPLOAD_DIR"]), "raptor.json"
            )
            screenshot_path = os.path.join(
                os.path.dirname(os.environ["MOZ_UPLOAD_DIR"]), "screenshots.html"
            )
        else:
            results_path = os.path.join(os.getcwd(), "raptor.json")
            screenshot_path = os.path.join(os.getcwd(), "screenshots.html")

        success = True
        if self.summarized_results == {}:
            success = False
            LOG.error(
                "no summarized raptor results found for any of %s"
                % ", ".join(test_names)
            )
        else:
            for suite in self.summarized_results["suites"]:
                gecko_profiling_enabled = "gecko-profile" in suite.get(
                    "extraOptions", []
                )
                if gecko_profiling_enabled:
                    LOG.info("gecko profiling enabled")
                    suite["shouldAlert"] = False

                # since we do navigation, tname could end in .<alias>;
                # test_names doesn't include the alias, so strip it off
                # before checking the name against test_names
                tname = suite["name"]
                parts = tname.split(".")
                if len(parts) > 1:
                    tname = ".".join(parts[:-1])

                # Since test names might have been modified, check if
                # part of the test name exists in the test_names list entries
                found = False
                for test in test_names:
                    if tname in test:
                        found = True
                        break
                if not found:
                    success = False
                    LOG.error("no summarized raptor results found for %s" % (tname))

            with open(results_path, "w") as f:
                for result in self.summarized_results:
                    f.write("%s\n" % result)

        if len(self.summarized_screenshots) > 0:
            with open(screenshot_path, "w") as f:
                for result in self.summarized_screenshots:
                    f.write("%s\n" % result)
            LOG.info("screen captures can be found locally at: %s" % screenshot_path)

        # now that we've checked for screen captures too, if there were no actual
        # test results we can bail out here
        if self.summarized_results == {}:
            return success, 0

        test_type = self.summarized_results["suites"][0].get("type", "")
        output_perf_data = True
        not_posting = "- not posting regular test results for perfherder"
        if test_type == "scenario":
            # if a resource-usage flag was supplied the perfherder data
            # will still be output from output_supporting_data
            LOG.info("scenario test type was run %s" % not_posting)
            output_perf_data = False

        if self.browser_name:
            self.summarized_results["application"] = {"name": self.browser_name}
            if self.browser_version:
                self.summarized_results["application"]["version"] = self.browser_version

        total_perfdata = 0
        if output_perf_data:
            # if we have supporting data i.e. power, we ONLY want those measurements
            # dumped out. TODO: Bug 1515406 - Add option to output both supplementary
            # data (i.e. power) and the regular Raptor test result
            # Both are already available as separate PERFHERDER_DATA json blobs
            if len(self.summarized_supporting_data) == 0:
                LOG.info("PERFHERDER_DATA: %s" % json.dumps(self.summarized_results))
                total_perfdata = 1
            else:
                LOG.info(
                    "supporting data measurements exist - only posting those to perfherder"
                )

        with open(results_path, "w") as f:
            json.dump(self.summarized_results, f, indent=2, sort_keys=True)
        LOG.info("results can also be found locally at: %s" % results_path)

        return success, total_perfdata

    def output_supporting_data(self, test_names):
        """
        Supporting data was gathered outside of the main raptor test; it has already
        been summarized, now output it appropriately.

        We want to output supporting data in a completely separate perfherder json blob and
        in a corresponding file artifact. This way, supporting data can be ingested as its own
        test suite in perfherder and alerted upon if desired; kept outside of the test results
        from the actual Raptor test which was run when the supporting data was gathered.
        """
        if len(self.summarized_supporting_data) == 0:
            LOG.error(
                "no summarized supporting data found for %s" % ", ".join(test_names)
            )
            return False, 0

        total_perfdata = 0
        for next_data_set in self.summarized_supporting_data:
            data_type = next_data_set["suites"][0]["type"]

            if os.getenv("MOZ_UPLOAD_DIR"):
                # i.e. testing/mozharness/build/raptor.json locally; in production it will
                # be at /tasks/task_*/build/ (where it will be picked up by mozharness later
                # and made into a tc artifact accessible in treeherder as perfherder-data.json)
                results_path = os.path.join(
                    os.path.dirname(os.environ["MOZ_UPLOAD_DIR"]),
                    "raptor-%s.json" % data_type,
                )
            else:
                results_path = os.path.join(os.getcwd(), "raptor-%s.json" % data_type)

            # dump data to raptor-data.json artifact
            with open(results_path, "w") as f:
                json.dump(next_data_set, f, indent=2, sort_keys=True)

            # the output that treeherder expects to find
            LOG.info("PERFHERDER_DATA: %s" % json.dumps(next_data_set))
            LOG.info(
                "%s results can also be found locally at: %s"
                % (data_type, results_path)
            )
            total_perfdata += 1

        return True, total_perfdata

    def construct_summary(self, vals, testname, unit=None):
        def _filter(vals, value=None):
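            # vals is a list of [value, name] pairs; return just the values,
            # optionally restricted to entries whose name matches `value`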
            if value is None:
                return [i for i, j in vals]
            return [i for i, j in vals if j == value]

        if testname.startswith("raptor-v8_7"):
            return 100 * filters.geometric_mean(_filter(vals))

        if testname == "speedometer3":
            score = None
            for val, name in vals:
                if name == "score":
                    score = val
            if score is None:
                raise Exception("Unable to find score for Speedometer 3")
            return score

        if "speedometer" in testname:
            correctionFactor = 3
            results = _filter(vals)
            # speedometer has 16 tests, each of which is made up of 9 subtests
            # and a sum of the 9 values. We receive 160 values, and want to use
            # the 16 test values, not the subtest values.
            if len(results) != 160:
                raise Exception(
                    "Speedometer has 160 subtests, found: %s instead" % len(results)
                )

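            # each test contributes 10 values (its 9 subtests plus their sum);
            # keep only the 16 sums by taking every 10th value, starting at index 9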
            results = results[9::10]
            # pylint --py3k W1619
            score = 60 * 1000 / filters.geometric_mean(results) / correctionFactor
            return score

        if "stylebench" in testname:
            # see https://bug-172968-attachments.webkit.org/attachment.cgi?id=319888
            correctionFactor = 3
            results = _filter(vals)

            # stylebench has 5 tests, each of which is made up of 5 subtests
            #
            #   * Adding classes.
            #   * Removing classes.
            #   * Mutating attributes.
            #   * Adding leaf elements.
            #   * Removing leaf elements.
            #
            # which are made up of two subtests each (sync/async) and repeated 5 times
            # each; thus, the list here looks like:
            #
            #   [Test name/Adding classes - 0/ Sync; <x>]
            #   [Test name/Adding classes - 0/ Async; <y>]
            #   [Test name/Adding classes - 0; <x> + <y>]
            #   [Test name/Removing classes - 0/ Sync; <x>]
            #   [Test name/Removing classes - 0/ Async; <y>]
            #   [Test name/Removing classes - 0; <x> + <y>]
            #   ...
            #   [Test name/Adding classes - 1 / Sync; <x>]
            #   [Test name/Adding classes - 1 / Async; <y>]
            #   [Test name/Adding classes - 1 ; <x> + <y>]
            #   ...
            #   [Test name/Removing leaf elements - 4; <x> + <y>]
            #   [Test name; <sum>] <- This is what we want.
            #
            # So, 5 (subtests) *
            #     5 (repetitions) *
            #     3 (entries per repetition (sync/async/sum)) =
            #     75 entries per test before the sum.
            #
            # We receive 76 entries per test, which adds up to 380. We want to use
            # the 5 test entries, not the rest.
            if len(results) != 380:
                raise Exception(
                    "StyleBench requires 380 entries, found: %s instead" % len(results)
                )
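            # keep only the per-test sum entries: every 76th value, starting at index 75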
            results = results[75::76]
            # pylint --py3k W1619
            return 60 * 1000 / filters.geometric_mean(results) / correctionFactor

        if testname.startswith("raptor-kraken") or "sunspider" in testname:
            return sum(_filter(vals))

        if "unity-webgl" in testname or "webaudio" in testname:
            # webaudio_score and unity_webgl_score: self reported as 'Geometric Mean'
            return filters.mean(_filter(vals, "Geometric Mean"))

        if "assorted-dom" in testname:
            # pylint: disable=W1633
            return round(filters.geometric_mean(_filter(vals)), 2)

        if "wasm-misc" in testname:
            # wasm_misc_score: self reported as '__total__'
            return filters.mean(_filter(vals, "__total__"))

        if "wasm-godot" in testname:
            # wasm_godot_score: first-interactive mean
            return filters.mean(_filter(vals, "first-interactive"))

        if "youtube-playback" in testname:
            # pylint: disable=W1633
            return round(filters.mean(_filter(vals)), 2)

        if "twitch-animation" in testname:
            return round(filters.geometric_mean(_filter(vals, "run")), 2)

        if testname.startswith("supporting_data"):
            if not unit:
                return sum(_filter(vals))

            if unit == "%":
                return filters.mean(_filter(vals))

            if unit in ("W", "MHz"):
                # For power in Watts and clock frequencies,
                # summarize with the sum of the averages
                allavgs = []
                for val, subtest in vals:
                    if "avg" in subtest:
                        allavgs.append(val)
                if allavgs:
                    return sum(allavgs)

                raise Exception(
                    "No average measurements found for supporting data with W, or MHz unit ."
                )

            if unit in ["KB", "mAh", "mWh"]:
                return sum(_filter(vals))

            raise NotImplementedError("Unit %s not supported" % unit)

        if len(vals) > 1:
            # pylint: disable=W1633
            return round(filters.geometric_mean(_filter(vals)), 2)

        # pylint: disable=W1633
        return round(filters.mean(_filter(vals)), 2)

    def parseUnknown(self, test):
        # Attempt to flatten whatever we've been given.
        # Dictionary keys will be joined by dashes, arrays represent "iterations".
        _subtests = {}

        if not isinstance(test["measurements"], dict):
            raise Exception(
                "Expected a dictionary with a single entry as the name of the test. "
                "The value of this key should be the data."
            )
        if test.get("custom_data", False):
            # If custom_data is true, the data was already flattened and the test
            # name is included in the keys (the test might also have removed it if
            # it's in the subtest_name_filters option). Handle this case by wrapping
            # the measurements in a dict keyed by the test name.
            test["measurements"] = {test["name"]: [test["measurements"]]}

        for iteration in test["measurements"][list(test["measurements"].keys())[0]]:
            flattened_metrics = None
            if not test.get("custom_data", False):
                flattened_metrics = flatten(iteration, ())
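                # e.g. a nested dict like {"a": {"b": 1}} should end up flattened to
                # something like {"a-b": 1} (keys joined by dashes, per the note above)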

            for metric, value in (flattened_metrics or iteration).items():
                if metric in METRIC_BLOCKLIST:
                    # TODO: Add an option in the test manifest for this
                    continue
                if metric not in _subtests:
                    # subtest not added yet, first pagecycle, so add new one
                    _subtests[metric] = {
                        "unit": test["subtest_unit"],
                        "alertThreshold": float(test["alert_threshold"]),
                        "lowerIsBetter": test["subtest_lower_is_better"],
                        "name": metric,
                        "replicates": [],
                    }
                if not isinstance(value, Iterable):
                    value = [value]
                # pylint: disable=W1633
                _subtests[metric]["replicates"].extend([round(x, 3) for x in value])

        vals = []
        subtests = []
        names = list(_subtests)
        names.sort(reverse=True)
        summaries = {
            "median": filters.median,
            "mean": filters.mean,
            "geomean": filters.geometric_mean,
        }
        for name in names:
            summary_method = test.get("submetric_summary_method", "median")
            _subtests[name]["value"] = round(
                summaries[summary_method](_subtests[name]["replicates"]), 3
            )
            subtests.append(_subtests[name])
            vals.append([_subtests[name]["value"], name])

        return subtests, vals

    def parseSpeedometerOutput(self, test):
        # each benchmark 'index' becomes a subtest; each pagecycle / iteration
        # of the test has multiple values per index/subtest

        # this is the format we receive the results in from the benchmark
        # i.e. this is ONE pagecycle of speedometer:

        # {u'name': u'raptor-speedometer', u'type': u'benchmark', u'measurements':
        # {u'speedometer': [[{u'AngularJS-TodoMVC/DeletingAllItems': [147.3000000000011,
        # 149.95999999999913, 143.29999999999927, 150.34000000000378, 257.6999999999971],
        # u'Inferno-TodoMVC/CompletingAllItems/Sync': [88.03999999999996,#
        # 85.60000000000036, 94.18000000000029, 95.19999999999709, 86.47999999999593],
        # u'AngularJS-TodoMVC': [518.2400000000016, 525.8199999999997, 610.5199999999968,
        # 532.8200000000215, 640.1800000000003], ...(repeated for each index/subtest)}]]},
        # u'browser': u'Firefox 62.0a1 20180528123052', u'lower_is_better': False, u'page':
        # u'http://localhost:55019/Speedometer/index.html?raptor', u'unit': u'score',
        # u'alert_threshold': 2}

        _subtests = {}
        data = test["measurements"]["speedometer"]
        for page_cycle in data:
            for sub, replicates in page_cycle[0].items():
                # for each pagecycle, build a list of subtests and append all related replicates
                if sub not in _subtests:
                    # subtest not added yet, first pagecycle, so add new one
                    _subtests[sub] = {
                        "unit": test["subtest_unit"],
                        "alertThreshold": float(test["alert_threshold"]),
                        "lowerIsBetter": test["subtest_lower_is_better"],
                        "name": sub,
                        "replicates": [],
                    }
                # pylint: disable=W1633
                _subtests[sub]["replicates"].extend([round(x, 3) for x in replicates])

        vals = []
        subtests = []
        names = list(_subtests)
        names.sort(reverse=True)
        for name in names:
            _subtests[name]["value"] = filters.median(_subtests[name]["replicates"])
            subtests.append(_subtests[name])
            vals.append([_subtests[name]["value"], name])

        return subtests, vals

    def parseAresSixOutput(self, test):
        """
        https://browserbench.org/ARES-6/
        Every pagecycle will perform the tests from the index page
        We have 4 main tests per index page:
        - Air, Basic, Babylon, ML
        - and from these 4 above, ares6 generates the Overall results
        Each test has 3 subtests (firstIteration, steadyState, averageWorstCase):
        - _steadyState
        - _firstIteration
        - _averageWorstCase
        Each index page will run 5 cycles, this is set in glue.js

        {
            'expected_browser_cycles': 1,
            'subtest_unit': 'ms',
            'name': 'raptor-ares6-firefox',
            'lower_is_better': False,
            'browser_cycle': '1',
            'subtest_lower_is_better': True,
            'cold': False,
            'browser': 'Firefox 69.0a1 20190531035909',
            'type': 'benchmark',
            'page': 'http://127.0.0.1:35369/ARES-6/index.html?raptor',
            'unit': 'ms',
            'alert_threshold': 2
            'measurements': {
                'ares6': [[{
                    'Babylon_firstIteration': [
                        123.68,
                        168.21999999999997,
                        127.34000000000003,
                        113.56,
                        128.78,
                        169.44000000000003
                    ],
                    'Air_steadyState': [
                        21.184723618090434,
                        22.906331658291457,
                        19.939396984924624,
                        20.572462311557775,
                        20.790452261306534,
                        18.378693467336696
                    ],
                    etc.
                }]]
            }
        }

        Details on how /ARES-6/index.html shows the mean on subsequent test results:

        I selected just a small part of the metrics to make it easier to explain
        what is going on.

        After the raptor GeckoView test finishes, we have these results in the logs:

        Extracted from "INFO - raptor-control-server Info: received webext_results:"
        'Air_firstIteration': [660.8000000000002, 626.4599999999999, 655.6199999999999,
        635.9000000000001, 636.4000000000001]

        Extracted from "INFO - raptor-output Info: PERFHERDER_DATA:"
        {"name": "Air_firstIteration", "lowerIsBetter": true, "alertThreshold": 2.0,
        "replicates": [660.8, 626.46, 655.62, 635.9, 636.4], "value": 636.4, "unit": "ms"}

        On GeckoView's /ARES-6/index.html this is what we see for Air - First Iteration:

        - on the 1st test cycle: 660.80 (rounded from 660.8000000000002)

        - on the 2nd test cycle: 643.63, which comes from
          (660.8000000000002 + 626.4599999999999) / 2,
          rounded to a precision of 2 decimals

        - on the 3rd test cycle: 647.63, which comes from
          (660.8000000000002 + 626.4599999999999 + 655.6199999999999) / 3,
          rounded to a precision of 2 decimals

        - and so on
        """

        _subtests = {}
        data = test["measurements"]["ares6"]

        for page_cycle in data:
            for sub, replicates in page_cycle[0].items():
                # for each pagecycle, build a list of subtests and append all related replicates
                if sub not in _subtests:
                    # subtest not added yet, first pagecycle, so add new one
                    _subtests[sub] = {
                        "unit": test["subtest_unit"],
                        "alertThreshold": float(test["alert_threshold"]),
                        "lowerIsBetter": test["subtest_lower_is_better"],
                        "name": sub,
                        "replicates": [],
                    }
                # pylint: disable=W1633
                _subtests[sub]["replicates"].extend(
                    [float(round(x, 3)) for x in replicates]
                )

        vals = []
        for name, subtest in _subtests.items():
            subtest["value"] = filters.mean(subtest["replicates"])
            vals.append([subtest["value"], name])

        # pylint W1656
        return list(_subtests.values()), sorted(vals, reverse=True)

    def parseMotionmarkOutput(self, test):
        # for motionmark we want the frameLength:average value for each test

        # this is the format we receive the results in from the benchmark
        # i.e. this is ONE pagecycle of motionmark htmlsuite test:composited Transforms:

        # {u'name': u'raptor-motionmark-firefox',
        #  u'type': u'benchmark',
        #  u'measurements': {
        #    u'motionmark':
        #      [[{u'HTMLsuite':
        #        {u'Composited Transforms':
        #          {u'scoreLowerBound': 272.9947975553528,
        #           u'frameLength': {u'average': 25.2, u'stdev': 27.0,
        #                            u'percent': 68.2, u'concern': 39.5},
        #           u'controller': {u'average': 300, u'stdev': 0, u'percent': 0, u'concern': 3},
        #           u'scoreUpperBound': 327.0052024446473,
        #           u'complexity': {u'segment1': [[300, 16.6], [300, 16.6]], u'complexity': 300,
        #                           u'segment2': [[300, None], [300, None]], u'stdev': 6.8},
        #           u'score': 300.00000000000006,
        #           u'complexityAverage': {u'segment1': [[30, 30], [30, 30]], u'complexity': 30,
        #                                  u'segment2': [[300, 300], [300, 300]], u'stdev': None}
        #  }}}]]}}

        _subtests = {}
        data = test["measurements"]["motionmark"]
        for page_cycle in data:
            page_cycle_results = page_cycle[0]

            # TODO: this assumes a single suite is run
            suite = list(page_cycle_results)[0]
            for sub in page_cycle_results[suite].keys():
                try:
                    # pylint: disable=W1633
                    replicate = round(
                        float(page_cycle_results[suite][sub]["frameLength"]["average"]),
                        3,
                    )
                except TypeError as e:
                    LOG.warning(
                        "[{}][{}] : {} - {}".format(suite, sub, e.__class__.__name__, e)
                    )
                    # no valid replicate for this subtest in this cycle, skip it
                    continue

                if sub not in _subtests:
                    # subtest not added yet, first pagecycle, so add new one
                    _subtests[sub] = {
                        "unit": test["subtest_unit"],
                        "alertThreshold": float(test["alert_threshold"]),
                        "lowerIsBetter": test["subtest_lower_is_better"],
                        "name": sub,
                        "replicates": [],
                    }
                _subtests[sub]["replicates"].extend([replicate])

        vals = []
        subtests = []
        names = list(_subtests)
        names.sort(reverse=True)
        for name in names:
            _subtests[name]["value"] = filters.median(_subtests[name]["replicates"])
            subtests.append(_subtests[name])
            vals.append([_subtests[name]["value"], name])

        return subtests, vals

    def parseYoutubePlaybackPerformanceOutput(self, test):
        """Parse the metrics for the Youtube playback performance test.

        For each video, measured values for dropped and decoded frames will be
        available from the benchmark site, e.g.:

        {u'PlaybackPerf.VP9.2160p60@2X': {u'droppedFrames': 1, u'decodedFrames': 796}}

        With each page cycle / iteration of the test, multiple values can be present.

        Raptor will calculate the percentage of dropped frames relative to decoded frames.
        All three values will then be emitted as separate subtests.
        """
        _subtests = {}
        test_name = [
            measurement
            for measurement in test["measurements"].keys()
            if "youtube-playback" in measurement
        ]
        if len(test_name) > 0:
            data = test["measurements"].get(test_name[0])
        else:
            raise Exception("No measurements found for youtube test!")

        def create_subtest_entry(
            name,
            value,
            unit=test["subtest_unit"],
            lower_is_better=test["subtest_lower_is_better"],
        ):
            # build a list of subtests and append all related replicates
            if name not in _subtests:
                # subtest not added yet, first pagecycle, so add new one
                _subtests[name] = {
                    "name": name,
                    "unit": unit,
                    "lowerIsBetter": lower_is_better,
                    "replicates": [],
                }

            _subtests[name]["replicates"].append(value)
            if self.subtest_alert_on is not None:
                if name in self.subtest_alert_on:
                    LOG.info(
                        "turning on subtest alerting for measurement type: %s" % name
                    )
                    _subtests[name]["shouldAlert"] = True

        failed_tests = []
        for pagecycle in data:
            for _sub, _value in six.iteritems(pagecycle[0]):
                if _value["decodedFrames"] == 0:
                    failed_tests.append(
                        "%s test Failed. decodedFrames %s droppedFrames %s."
                        % (_sub, _value["decodedFrames"], _value["droppedFrames"])
                    )

                try:
                    percent_dropped = (
                        float(_value["droppedFrames"]) / _value["decodedFrames"] * 100.0
                    )
                except ZeroDivisionError:
                    # if no frames have been decoded the playback failed completely
                    percent_dropped = 100.0

                # Remove the unneeded "PlaybackPerf." prefix from each test name
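                # e.g. "PlaybackPerf.VP9.2160p60@2X" becomes "VP9.2160p60@2X"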
                _sub = _sub.split("PlaybackPerf", 1)[-1]
                if _sub.startswith("."):
                    _sub = _sub[1:]

                # build a list of subtests and append all related replicates
                create_subtest_entry(
                    "{}_decoded_frames".format(_sub),
                    _value["decodedFrames"],
                    lower_is_better=False,
                )
                create_subtest_entry(
                    "{}_dropped_frames".format(_sub), _value["droppedFrames"]
                )
                create_subtest_entry(
                    "{}_%_dropped_frames".format(_sub), percent_dropped
                )

        # Check if any youtube test failed and generate exception
        if len(failed_tests) > 0:
            for failed_test in failed_tests:
                LOG.warning("Youtube sub-test FAILED: %s" % failed_test)
            # TODO: Change this to raise Exception after we figure out the failing tests
            LOG.warning(
                "Youtube playback sub-tests failed!!! "
                "Not submitting results to perfherder!"
            )
        vals = []
        subtests = []
        names = list(_subtests)
        names.sort(reverse=True)
        for name in names:
            # pylint: disable=W1633
            _subtests[name]["value"] = round(
                float(filters.median(_subtests[name]["replicates"])), 2
            )
            subtests.append(_subtests[name])
            # only include dropped_frames values, without the %_dropped_frames values
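            # e.g. "VP9.2160p60@2X_dropped_frames" is kept, "VP9.2160p60@2X_%_dropped_frames" is not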
            if name.endswith("X_dropped_frames"):
                vals.append([_subtests[name]["value"], name])

        return subtests, vals

    def parseUnityWebGLOutput(self, test):
        """
        Example output (this is one page cycle):

        {'name': 'raptor-unity-webgl-firefox',
         'type': 'benchmark',
         'measurements': {
            'unity-webgl': [
                [
                    '[{"benchmark":"Mandelbrot GPU","result":1035361},...}]'
                ]
            ]
         },
         'lower_is_better': False,
         'unit': 'score'
        }
        """
        _subtests = {}
        data = test["measurements"]["unity-webgl"]
        for page_cycle in data:
            data = json.loads(page_cycle[0])
            for item in data:
                # for each pagecycle, build a list of subtests and append all related replicates
                sub = item["benchmark"]
                if sub not in _subtests:
                    # subtest not added yet, first pagecycle, so add new one
                    _subtests[sub] = {
                        "unit": test["subtest_unit"],
                        "alertThreshold": float(test["alert_threshold"]),
                        "lowerIsBetter": test["subtest_lower_is_better"],
                        "name": sub,
                        "replicates": [],
                    }
                _subtests[sub]["replicates"].append(item["result"])

        vals = []
        subtests = []
        names = list(_subtests)
        names.sort(reverse=True)
        for name in names:
            _subtests[name]["value"] = filters.median(_subtests[name]["replicates"])
            subtests.append(_subtests[name])
            vals.append([_subtests[name]["value"], name])

        return subtests, vals

    def parseWebaudioOutput(self, test):
        # each benchmark 'index' becomes a subtest; each pagecycle / iteration
        # of the test has multiple values per index/subtest

        # this is the format we receive the results in from the benchmark
        # i.e. this is ONE pagecycle of webaudio:

        # {u'name': u'raptor-webaudio-firefox', u'type': u'benchmark', u'measurements':
        # {u'webaudio': [[u'[{"name":"Empty testcase","duration":26,"buffer":{}},{"name"
        # :"Simple gain test without resampling","duration":66,"buffer":{}},{"name":"Simple
        # gain test without resampling (Stereo)","duration":71,"buffer":{}},{"name":"Simple
        # gain test without resampling (Stereo and positional)","duration":67,"buffer":{}},
        # {"name":"Simple gain test","duration":41,"buffer":{}},{"name":"Simple gain test
        # (Stereo)","duration":59,"buffer":{}},{"name":"Simple gain test (Stereo and positional)",
        # "duration":68,"buffer":{}},{"name":"Upmix without resampling (Mono -> Stereo)",
        # "duration":53,"buffer":{}},{"name":"Downmix without resampling (Mono -> Stereo)",
        # "duration":44,"buffer":{}},{"name":"Simple mixing (same buffer)",
        # "duration":288,"buffer":{}}

        _subtests = {}
        data = test["measurements"]["webaudio"]
        for page_cycle in data:
            data = json.loads(page_cycle[0])
            for item in data:
                # for each pagecycle, build a list of subtests and append all related replicates
                sub = item["name"]
                replicates = [item["duration"]]
                if sub not in _subtests:
                    # subtest not added yet, first pagecycle, so add new one
                    _subtests[sub] = {
                        "unit": test["subtest_unit"],
                        "alertThreshold": float(test["alert_threshold"]),
                        "lowerIsBetter": test["subtest_lower_is_better"],
                        "name": sub,
                        "replicates": [],
                    }
                # pylint: disable=W1633
                _subtests[sub]["replicates"].extend(
                    [float(round(x, 3)) for x in replicates]
                )

        vals = []
        subtests = []
        names = list(_subtests)
        names.sort(reverse=True)
        for name in names:
            _subtests[name]["value"] = filters.median(_subtests[name]["replicates"])
            subtests.append(_subtests[name])
            vals.append([_subtests[name]["value"], name])

        print(subtests)
        return subtests, vals

    def parseWASMGodotOutput(self, test):
        """
        {u'wasm-godot': [
            {
              "name": "wasm-instantiate",
              "time": 349
            },{
              "name": "engine-instantiate",
              "time": 1263
            ...
            }]}
        """
        _subtests = {}
        data = test["measurements"]["wasm-godot"]
        print(data)
        for page_cycle in data:
            for item in page_cycle[0]:
                # for each pagecycle, build a list of subtests and append all related replicates
                sub = item["name"]
                if sub not in _subtests:
                    # subtest not added yet, first pagecycle, so add new one
                    _subtests[sub] = {
                        "unit": test["subtest_unit"],
                        "alertThreshold": float(test["alert_threshold"]),
                        "lowerIsBetter": test["subtest_lower_is_better"],
                        "name": sub,
                        "replicates": [],
                    }
                _subtests[sub]["replicates"].append(item["time"])

        vals = []
        subtests = []
        names = list(_subtests)
        names.sort(reverse=True)
        for name in names:
            _subtests[name]["value"] = filters.median(_subtests[name]["replicates"])
            subtests.append(_subtests[name])
            vals.append([_subtests[name]["value"], name])

        return subtests, vals

    def parseSunspiderOutput(self, test):
        _subtests = {}
        data = test["measurements"]["sunspider"]
        for page_cycle in data:
            for sub, replicates in page_cycle[0].items():
                # for each pagecycle, build a list of subtests and append all related replicates
                if sub not in _subtests:
                    # subtest not added yet, first pagecycle, so add new one
                    _subtests[sub] = {
                        "unit": test["subtest_unit"],
                        "alertThreshold": float(test["alert_threshold"]),
                        "lowerIsBetter": test["subtest_lower_is_better"],
                        "name": sub,
                        "replicates": [],
                    }
                # pylint: disable=W1633
                _subtests[sub]["replicates"].extend(
                    [float(round(x, 3)) for x in replicates]
                )

        subtests = []
        vals = []

        names = list(_subtests)
        names.sort(reverse=True)
        for name in names:
            _subtests[name]["value"] = filters.mean(_subtests[name]["replicates"])
            subtests.append(_subtests[name])

            vals.append([_subtests[name]["value"], name])

        return subtests, vals

    def parseAssortedDomOutput(self, test):
        # each benchmark 'index' becomes a subtest; each pagecycle / iteration
        # of the test has multiple values

        # this is the format we receive the results in from the benchmark
        # i.e. this is ONE pagecycle of assorted-dom ('test' is a valid subtest name btw):

        # {u'worker-getname-performance-getter': 5.9, u'window-getname-performance-getter': 6.1,
        # u'window-getprop-performance-getter': 6.1, u'worker-getprop-performance-getter': 6.1,
        # u'test': 5.8, u'total': 30}

        # the 'total' is provided for us by the benchmark; the overall score is computed
        # from the 'total' values of all pagecycles, but we keep all the subtest values
        # for the logs/json

        _subtests = {}
        data = test["measurements"]["assorted-dom"]
        for pagecycle in data:
            for _sub, _value in pagecycle[0].items():
                # build a list of subtests and append all related replicates
                if _sub not in _subtests:
                    # subtest not added yet, first pagecycle, so add new one
                    _subtests[_sub] = {
                        "unit": test["subtest_unit"],
                        "alertThreshold": float(test["alert_threshold"]),
                        "lowerIsBetter": test["subtest_lower_is_better"],
                        "name": _sub,
                        "replicates": [],
                    }
                _subtests[_sub]["replicates"].extend([_value])

        vals = []
        subtests = []
        names = list(_subtests)
        names.sort(reverse=True)
        for name in names:
            # pylint: disable=W1633
            _subtests[name]["value"] = float(
                round(filters.median(_subtests[name]["replicates"]), 2)
            )
            subtests.append(_subtests[name])
            # only use the 'total's to compute the overall result
            if name == "total":
                vals.append([_subtests[name]["value"], name])

        return subtests, vals

    def parseJetstreamTwoOutput(self, test):
        # https://browserbench.org/JetStream/

        _subtests = {}
        data = test["measurements"]["jetstream2"]
        for page_cycle in data:
            for sub, replicates in page_cycle[0].items():
                # for each pagecycle, build a list of subtests and append all related replicates
                if sub not in _subtests:
                    # subtest not added yet, first pagecycle, so add new one
                    _subtests[sub] = {
                        "unit": test["subtest_unit"],
                        "alertThreshold": float(test["alert_threshold"]),
                        "lowerIsBetter": test["subtest_lower_is_better"],
                        "name": sub,
                        "replicates": [],
                    }
                # pylint: disable=W1633
                _subtests[sub]["replicates"].extend(
                    [float(round(x, 3)) for x in replicates]
                )

        vals = []
        subtests = []
        names = list(_subtests)
        names.sort(reverse=True)
        for name in names:
            _subtests[name]["value"] = filters.mean(_subtests[name]["replicates"])
            subtests.append(_subtests[name])
            vals.append([_subtests[name]["value"], name])

        return subtests, vals

    def parseWASMMiscOutput(self, test):
        """
        {u'wasm-misc': [
          [[{u'name': u'validate', u'time': 163.44000000000005},
            ...
            {u'name': u'__total__', u'time': 63308.434904788155}]],
          ...
          [[{u'name': u'validate', u'time': 129.42000000000002},
            {u'name': u'__total__', u'time': 63181.24089257814}]]
         ]}
        """
        _subtests = {}
        data = test["measurements"]["wasm-misc"]
        for page_cycle in data:
            for item in page_cycle[0]:
                # for each pagecycle, build a list of subtests and append all related replicates
                sub = item["name"]
                if sub not in _subtests:
                    # subtest not added yet, first pagecycle, so add new one
                    _subtests[sub] = {
                        "unit": test["subtest_unit"],
                        "alertThreshold": float(test["alert_threshold"]),
                        "lowerIsBetter": test["subtest_lower_is_better"],
                        "name": sub,
                        "replicates": [],
                    }
                _subtests[sub]["replicates"].append(item["time"])

        vals = []
        subtests = []
        names = list(_subtests)
        names.sort(reverse=True)
        for name in names:
            _subtests[name]["value"] = filters.median(_subtests[name]["replicates"])
            subtests.append(_subtests[name])
            vals.append([_subtests[name]["value"], name])

        return subtests, vals

    def parseMatrixReactBenchOutput(self, test):
        # https://github.com/jandem/matrix-react-bench

        _subtests = {}
        data = test["measurements"]["matrix-react-bench"]
        for page_cycle in data:
            # Each cycle is formatted like `[[iterations, val], [iterations, val2], ...]`
            for iteration, val in page_cycle:
                sub = f"{iteration}-iterations"
                _subtests.setdefault(
                    sub,
                    {
                        "unit": test["subtest_unit"],
                        "alertThreshold": float(test["alert_threshold"]),
                        "lowerIsBetter": test["subtest_lower_is_better"],
                        "name": sub,
                        "replicates": [],
                    },
                )

                # The values produced are far too large for perfherder
                _subtests[sub]["replicates"].append(val)

        vals = []
        subtests = []
        names = list(_subtests)
        names.sort(reverse=True)
        for name in names:
            _subtests[name]["value"] = filters.mean(_subtests[name]["replicates"])
            subtests.append(_subtests[name])
            vals.append([_subtests[name]["value"], name])

        return subtests, vals

    def parseTwitchAnimationOutput(self, test):
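        """Parse twitch-animation results.

        Only perfstat-* metrics and the "twitch-animation" run metric are
        kept (the latter is renamed to "run"); each subtest's value is the
        mean of its replicates.
        """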
        _subtests = {}

        for metric, data in test["measurements"].items():
            if "perfstat-" not in metric and metric != "twitch-animation":
                # Only keep perfstats or the run metric
                continue
            if metric == "twitch-animation":
                metric = "run"

            for page_cycle in data:
                # each benchmark cycle is formatted like `[val]`, while
                # perfstats are plain numbers; normalize both to a list
                if not isinstance(page_cycle, list):
                    page_cycle = [page_cycle]
                for val in page_cycle:
                    _subtests.setdefault(
                        metric,
                        {
                            "unit": test["subtest_unit"],
                            "alertThreshold": float(test["alert_threshold"]),
                            "lowerIsBetter": test["subtest_lower_is_better"],
                            "name": metric,
                            "replicates": [],
                        },
                    )

                    # The values produced are far too large for perfherder
                    _subtests[metric]["replicates"].append(val)

        vals = []
        subtests = []
        names = list(_subtests)
        names.sort(reverse=True)
        for name in names:
            _subtests[name]["value"] = filters.mean(_subtests[name]["replicates"])
            subtests.append(_subtests[name])
            vals.append([_subtests[name]["value"], name])

        return subtests, vals


class RaptorOutput(PerftestOutput):
    """class for raptor output"""

    def summarize(self, test_names):
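        """Summarize the parsed raptor test results into perfherder-formatted suites.

        Pageload/scenario tests produce one subtest per measurement type
        (the median of the filtered replicates), while benchmark tests are
        delegated to their benchmark-specific parsers.
        """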
        suites = []
        test_results = {"framework": {"name": "raptor"}, "suites": suites}

        # check if we actually have any results
        if len(self.results) == 0:
            LOG.error("no raptor test results found for %s" % ", ".join(test_names))
            return

        for test in self.results:
            vals = []
            subtests = []
            suite = {
                "name": test["name"],
                "type": test["type"],
                "tags": test.get("tags", []),
                "extraOptions": test["extra_options"],
                "subtests": subtests,
                "lowerIsBetter": test["lower_is_better"],
                "unit": test["unit"],
                "alertThreshold": float(test["alert_threshold"]),
            }

            # Check if optional properties have been set by the test
            if "alert_change_type" in test:
                suite["alertChangeType"] = test["alert_change_type"]

            # if cold load add that info to the suite result dict; this will be used later
            # when combining the results from multiple browser cycles into one overall result
            if test["cold"] is True:
                suite["cold"] = True
                suite["browser_cycle"] = int(test["browser_cycle"])
                suite["expected_browser_cycles"] = int(test["expected_browser_cycles"])
                suite["tags"].append("cold")
            else:
                suite["tags"].append("warm")

            suites.append(suite)

            # process results for pageloader type of tests
            if test["type"] in ("pageload", "scenario"):
                # each test can report multiple measurements per pageload
                # each measurement becomes a subtest inside the 'suite'

                # this is the format we receive the results in from the pageload test
                # i.e. one test (subtest) in raptor-firefox-tp6:

                # {u'name': u'raptor-firefox-tp6-amazon', u'type': u'pageload', u'measurements':
                # {u'fnbpaint': [788, 315, 334, 286, 318, 276, 296, 296, 292, 285, 268, 277, 274,
                # 328, 295, 290, 286, 270, 279, 280, 346, 303, 308, 398, 281]}, u'browser':
                # u'Firefox 62.0a1 20180528123052', u'lower_is_better': True, u'page':
                # u'https://www.amazon.com/s/url=search-alias%3Daps&field-keywords=laptop',
                # u'unit': u'ms', u'alert_threshold': 2}

                for measurement_name, replicates in test["measurements"].items():
                    new_subtest = {}
                    new_subtest["name"] = measurement_name
                    new_subtest["replicates"] = replicates
                    new_subtest["lowerIsBetter"] = test["subtest_lower_is_better"]
                    new_subtest["alertThreshold"] = float(test["alert_threshold"])
                    new_subtest["value"] = 0
                    new_subtest["unit"] = test["subtest_unit"]

                    if test["cold"] is False:
                        # for warm page-load, ignore first value due to 1st pageload noise
                        LOG.info(
                            "ignoring the first %s value due to initial pageload noise"
                            % measurement_name
                        )
                        filtered_values = filters.ignore_first(
                            new_subtest["replicates"], 1
                        )
                    else:
                        # for cold-load we want all the values
                        filtered_values = new_subtest["replicates"]

                    # for pageload tests that measure TTFI: TTFI is not guaranteed to be available
                    # every time; the raptor measure.js webext will substitute a '-1' value in the
                    # cases where TTFI is not available, which is acceptable; however we don't want
                    # to include those '-1' TTFI values in our final results calculations
                    if measurement_name == "ttfi":
                        filtered_values = filters.ignore_negative(filtered_values)
                        # we've already removed the first pageload value; if there aren't any more
                        # valid TTFI values available for this pageload just remove it from results
                        if len(filtered_values) < 1:
                            continue

                    # if 'alert_on' is set for this particular measurement, then we want to set the
                    # flag in the perfherder output to turn on alerting for this subtest
                    if self.subtest_alert_on is not None:
                        if measurement_name in self.subtest_alert_on:
                            LOG.info(
                                "turning on subtest alerting for measurement type: %s"
                                % measurement_name
                            )
                            new_subtest["shouldAlert"] = True
                        else:
                            # Explicitly set `shouldAlert` to False so that the measurement
                            # is not alerted on. Otherwise Perfherder defaults to alerting.
                            LOG.info(
                                "turning off subtest alerting for measurement type: %s"
                                % measurement_name
                            )
                            new_subtest["shouldAlert"] = False

                    new_subtest["value"] = filters.median(filtered_values)

                    vals.append([new_subtest["value"], new_subtest["name"]])
                    subtests.append(new_subtest)

            elif test["type"] == "benchmark":

                if any(
                    [
                        "youtube-playback" in measurement
                        for measurement in test["measurements"].keys()
                    ]
                ):
                    subtests, vals = self.parseYoutubePlaybackPerformanceOutput(test)
                elif "assorted-dom" in test["measurements"]:
                    subtests, vals = self.parseAssortedDomOutput(test)
                elif "ares6" in test["measurements"]:
                    subtests, vals = self.parseAresSixOutput(test)
                elif "jetstream2" in test["measurements"]:
                    subtests, vals = self.parseJetstreamTwoOutput(test)
                elif "motionmark" in test["measurements"]:
                    subtests, vals = self.parseMotionmarkOutput(test)
                elif "speedometer" in test["measurements"]:
                    # this includes stylebench
                    subtests, vals = self.parseSpeedometerOutput(test)
                elif "sunspider" in test["measurements"]:
                    subtests, vals = self.parseSunspiderOutput(test)
                elif "unity-webgl" in test["measurements"]:
                    subtests, vals = self.parseUnityWebGLOutput(test)
                elif "wasm-godot" in test["measurements"]:
                    subtests, vals = self.parseWASMGodotOutput(test)
                elif "wasm-misc" in test["measurements"]:
                    subtests, vals = self.parseWASMMiscOutput(test)
                elif "webaudio" in test["measurements"]:
                    subtests, vals = self.parseWebaudioOutput(test)
                else:
                    subtests, vals = self.parseUnknown(test)

                suite["subtests"] = subtests

            else:
                LOG.error(
                    "output.summarize received unsupported test results type for %s"
                    % test["name"]
                )
                return

            suite["tags"].append(test["type"])

            # for benchmarks there is generally more than one subtest in each cycle
            # and a benchmark-specific formula is needed to calculate the final score
            # we no longer summarise the page load as we alert on individual subtests
            # and the geometric mean was found to be of little value
            if len(subtests) > 1 and test["type"] != "pageload":
                suite["value"] = self.construct_summary(vals, testname=test["name"])

            subtests.sort(key=lambda subtest: subtest["name"])
            suite["tags"].sort()

        suites.sort(key=lambda suite: suite["name"])

        self.summarized_results = test_results

    def combine_browser_cycles(self):
        """
        At this point the results have been summarized; however there may have been multiple
        browser cycles (i.e. cold load). In which case the results have one entry for each
        test for each browser cycle. For each test we need to combine the results for all
        browser cycles into one results entry.

        For example, this is what the summarized results suites list looks like from a test that
        was run with multiple (two) browser cycles:

        [{'expected_browser_cycles': 2, 'extraOptions': [],
            'name': u'raptor-tp6m-amazon-geckoview-cold', 'lowerIsBetter': True,
            'alertThreshold': 2.0, 'value': 1776.94, 'browser_cycle': 1,
            'subtests': [{'name': u'dcf', 'lowerIsBetter': True, 'alertThreshold': 2.0,
                'value': 818, 'replicates': [818], 'unit': u'ms'}, {'name': u'fcp',
                'lowerIsBetter': True, 'alertThreshold': 2.0, 'value': 1131, 'shouldAlert': True,
                'replicates': [1131], 'unit': u'ms'}, {'name': u'fnbpaint', 'lowerIsBetter': True,
                'alertThreshold': 2.0, 'value': 1056, 'replicates': [1056], 'unit': u'ms'},
                {'name': u'ttfi', 'lowerIsBetter': True, 'alertThreshold': 2.0, 'value': 18074,
                'replicates': [18074], 'unit': u'ms'}, {'name': u'loadtime', 'lowerIsBetter': True,
                'alertThreshold': 2.0, 'value': 1002, 'shouldAlert': True, 'replicates': [1002],
                'unit': u'ms'}],
            'cold': True, 'type': u'pageload', 'unit': u'ms'},
        {'expected_browser_cycles': 2, 'extraOptions': [],
            'name': u'raptor-tp6m-amazon-geckoview-cold', 'lowerIsBetter': True,
            'alertThreshold': 2.0, 'value': 840.25, 'browser_cycle': 2,
            'subtests': [{'name': u'dcf', 'lowerIsBetter': True, 'alertThreshold': 2.0,
                'value': 462, 'replicates': [462], 'unit': u'ms'}, {'name': u'fcp',
                'lowerIsBetter': True, 'alertThreshold': 2.0, 'value': 718, 'shouldAlert': True,
                'replicates': [718], 'unit': u'ms'}, {'name': u'fnbpaint', 'lowerIsBetter': True,
                'alertThreshold': 2.0, 'value': 676, 'replicates': [676], 'unit': u'ms'},
                {'name': u'ttfi', 'lowerIsBetter': True, 'alertThreshold': 2.0, 'value': 3084,
                'replicates': [3084], 'unit': u'ms'}, {'name': u'loadtime', 'lowerIsBetter': True,
                'alertThreshold': 2.0, 'value': 605, 'shouldAlert': True, 'replicates': [605],
                'unit': u'ms'}],
            'cold': True, 'type': u'pageload', 'unit': u'ms'}]

        Need to combine those into a single entry.
        """
        # check if we actually have any results
        if len(self.results) == 0:
            LOG.info(
                "no raptor test results found, so no need to combine browser cycles"
            )
            return

        # first build a list of entries that need to be combined; as we do that, mark the
        # original suite entry for deletion, so once combined we know which ones to delete.
        # note that summarized results are for all tests that were run in the session, which
        # could include cold and / or warm page-load and / or benchmark tests combined
        suites_to_be_combined = []

        for _index, suite in enumerate(self.summarized_results.get("suites", [])):
            if suite.get("cold") is None:
                continue

            if suite["expected_browser_cycles"] > 1:
                _name = suite["name"]
                _details = suite.copy()
                suites_to_be_combined.append({"name": _name, "details": _details})
                suite["to_be_deleted"] = True

        # now create a new suite entry that will have all the results from
        # all of the browser cycles, but in one result entry for each test
        combined_suites = {}

        for next_suite in suites_to_be_combined:
            suite_name = next_suite["details"]["name"]
            browser_cycle = next_suite["details"]["browser_cycle"]
            LOG.info(
                "combining results from browser cycle %d for %s"
                % (browser_cycle, suite_name)
            )
            if suite_name not in combined_suites:
                # first browser cycle so just take entire entry to start with
                combined_suites[suite_name] = next_suite["details"]
                LOG.info("created new combined result with intial cycle replicates")
                # remove the 'cold', 'browser_cycle', and 'expected_browser_cycles' info
                # as we don't want that showing up in perfherder data output
                del combined_suites[suite_name]["cold"]
                del combined_suites[suite_name]["browser_cycle"]
                del combined_suites[suite_name]["expected_browser_cycles"]
            else:
                # subsequent browser cycles, already have an entry; just add subtest replicates
                for next_subtest in next_suite["details"]["subtests"]:
                    # find the existing entry for that subtest in our new combined test entry
                    found_subtest = False
                    for combined_subtest in combined_suites[suite_name]["subtests"]:
                        if combined_subtest["name"] == next_subtest["name"]:
                            # add subtest (measurement type) replicates to the combined entry
                            LOG.info("adding replicates for %s" % next_subtest["name"])
                            combined_subtest["replicates"].extend(
                                next_subtest["replicates"]
                            )
                            found_subtest = True
                    # the subtest / measurement type wasn't found in our existing combined
                    # result entry; if it is for the same suite name add it - this could happen
                    # as ttfi may not be available in every browser cycle
                    if not found_subtest:
                        LOG.info("adding replicates for %s" % next_subtest["name"])
                        combined_suites[next_suite["details"]["name"]][
                            "subtests"
                        ].append(next_subtest)

        # now we have a single entry for each test; with all replicates from all browser cycles
        for name in combined_suites:
            vals = []
            for next_sub in combined_suites[name]["subtests"]:
                # calculate sub-test results (i.e. each measurement type)
                next_sub["value"] = filters.median(next_sub["replicates"])
                # add to vals; vals is used to calculate overall suite result i.e. the
                # geomean of all of the subtests / measurement types
                vals.append([next_sub["value"], next_sub["name"]])

            # calculate overall suite result ('value') which is geomean of all measures
            if len(combined_suites[name]["subtests"]) > 1:
                combined_suites[name]["value"] = self.construct_summary(
                    vals, testname=name
                )

            # now add the combined suite entry to our overall summarized results!
            self.summarized_results["suites"].append(combined_suites[name])

        # now it is safe to delete the original entries that were made by each cycle
        self.summarized_results["suites"] = [
            item
            for item in self.summarized_results["suites"]
            if item.get("to_be_deleted") is not True
        ]

    def summarize_screenshots(self, screenshots):
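        """Build an HTML report of the captured screenshots.

        Each screenshot entry becomes a table row showing the test name,
        page cycle, and the embedded image; the generated markup is appended
        to self.summarized_screenshots.
        """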
        if len(screenshots) == 0:
            return

        self.summarized_screenshots.append(
            """<!DOCTYPE html>
        <head>
        <style>
            table, th, td {
              border: 1px solid black;
              border-collapse: collapse;
            }
        </style>
        </head>
        <html> <body>
        <h1>Captured screenshots!</h1>
        <table style="width:100%">
          <tr>
            <th>Test Name</th>
            <th>Pagecycle</th>
            <th>Screenshot</th>
          </tr>"""
        )

        for screenshot in screenshots:
            self.summarized_screenshots.append(
                """<tr>
            <th>%s</th>
            <th> %s</th>
            <th>
                <img src="%s" alt="%s %s" width="320" height="240">
            </th>
            </tr>"""
                % (
                    screenshot["test_name"],
                    screenshot["page_cycle"],
                    screenshot["screenshot"],
                    screenshot["test_name"],
                    screenshot["page_cycle"],
                )
            )

        self.summarized_screenshots.append("""</table></body> </html>""")


class BrowsertimeOutput(PerftestOutput):
    """class for browsertime output"""

    def summarize(self, test_names):
        """
        Summarize the parsed browsertime test output, and format accordingly so the output can
        be ingested by Perfherder.

        At this point each entry in self.results for browsertime-pageload tests is in this format:

        {'statistics':{'fcp': {u'p99': 932, u'mdev': 10.0941, u'min': 712, u'p90': 810, u'max':
        932, u'median': 758, u'p10': 728, u'stddev': 50, u'mean': 769}, 'dcf': {u'p99': 864,
        u'mdev': 11.6768, u'min': 614, u'p90': 738, u'max': 864, u'median': 670, u'p10': 632,
        u'stddev': 58, u'mean': 684}, 'fnbpaint': {u'p99': 830, u'mdev': 9.6851, u'min': 616,
        u'p90': 719, u'max': 830, u'median': 668, u'p10': 642, u'stddev': 48, u'mean': 680},
        'loadtime': {u'p99': 5818, u'mdev': 111.7028, u'min': 3220, u'p90': 4450, u'max': 5818,
        u'median': 3476, u'p10': 3241, u'stddev': 559, u'mean': 3642}}, 'name':
        'raptor-tp6-guardian-firefox', 'url': 'https://www.theguardian.co.uk', 'lower_is_better':
        True, 'measurements': {'fcp': [932, 744, 744, 810, 712, 775, 759, 744, 777, 739, 809, 906,
        734, 742, 760, 758, 728, 792, 757, 759, 742, 759, 775, 726, 730], 'dcf': [864, 679, 637,
        662, 652, 651, 710, 679, 646, 689, 686, 845, 670, 694, 632, 703, 670, 738, 633, 703, 614,
        703, 650, 622, 670], 'fnbpaint': [830, 648, 666, 704, 616, 683, 678, 650, 685, 651, 719,
        820, 634, 664, 681, 664, 642, 703, 668, 670, 669, 668, 681, 652, 642], 'loadtime': [4450,
        3592, 3770, 3345, 3453, 3220, 3434, 3621, 3511, 3416, 3430, 5818, 4729, 3406, 3506, 3588,
        3245, 3381, 3707, 3241, 3595, 3483, 3236, 3390, 3476]}, 'subtest_unit': 'ms', 'bt_ver':
        '4.9.2-android', 'alert_threshold': 2, 'cold': True, 'type': 'browsertime-pageload',
        'unit': 'ms', 'browser': "{u'userAgent': u'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13;
        rv:70.0) Gecko/20100101 Firefox/70.0', u'windowSize': u'1366x694'}"}

        Now we must process this further and prepare the result for output suitable for perfherder
        ingestion.

        Note: For the overall subtest values/results (i.e. for each measurement type) we will use
        the Browsertime-provided statistics, instead of calculating our own geomeans from the
        replicates.
        """

        def _filter_data(data, method, subtest_name):
            """
            Take the kmeans of the data, and attempt to filter this way.
            We'll use hard-coded values to get rid of data that is 2x
            smaller/larger than the majority of the data. If the data approaches
            a 35%/65% split, then it won't be filtered as we can't figure
            out which one is the right mean to take.

            The way that this will work for multi-modal data (more than 2 modes)
            is that the majority of the modes will end up in either one bin or
            the other. Taking the group with the most points lets us
            consistently remove very large outliers out of the data and target the
            modes with the largest prominence.

            TODO: The seed exists because using a randomized one often gives us
            multiple results on the same dataset. This should keep things more
            consistent from one task to the next. We should also look into playing
            with iterations, but that comes at the cost of processing time (this
            might not be a valid concern).
            """
            import numpy as np
            from scipy.cluster.vq import kmeans2, whiten

            data = np.asarray(data)

            # Disable kmeans2 empty cluster warnings
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                kmeans, result = kmeans2(
                    whiten(np.asarray([float(d) for d in data])), 2, seed=1000
                )

            if len(kmeans) < 2:
                # Default to a gaussian filter if we didn't get 2 means
                summary_method = np.mean
                if method == "geomean":
                    summary_method = filters.geometric_mean

                # Apply a gaussian filter
                data = data[
                    np.where(data > summary_method(data) - (np.std(data) * 2))[0]
                ]
                data = list(
                    data[np.where(data < summary_method(data) + (np.std(data) * 2))[0]]
                )
            else:
                first_group = data[np.where(result == 0)]
                secnd_group = data[np.where(result == 1)]

                total_len = len(data)
                first_len = len(first_group)
                secnd_len = len(secnd_group)

                ratio = np.ceil((min(first_len, secnd_len) / total_len) * 100)
                if ratio <= 35:
                    # If one of the groups are less than 35% of the total
                    # size, then filter it out if the difference in the
                    # k-means are large enough (200% difference).
                    max_mean = max(kmeans)
                    min_mean = min(kmeans)
                    if abs(max_mean / min_mean) > 2:
                        major_group = first_group
                        major_mean = np.mean(first_group) if first_len > 0 else 0
                        minor_mean = np.mean(secnd_group) if secnd_len > 0 else 0
                        if first_len < secnd_len:
                            major_group = secnd_group
                            major_mean, minor_mean = minor_mean, major_mean

                        LOG.info(
                            f"{subtest_name}: Filtering out {total_len - len(major_group)} "
                            f"data points found in minor_group of data with "
                            f"mean {minor_mean} vs. {major_mean} in major group"
                        )
                        data = major_group

            return data

        def _process_alt_method(subtest, alternative_method):
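            """Summarize a subtest with an alternative method ("geomean" or "mean").

            Replicates are first run through _filter_data (only when there are
            more than 10 data points), then summarized with the requested
            method and rounded to one decimal place.
            """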
            # Don't filter with less than 10 data points
            data = subtest["replicates"]
            if len(subtest["replicates"]) > 10:
                data = _filter_data(data, alternative_method, subtest["name"])
            if alternative_method == "geomean":
                subtest["value"] = round(filters.geometric_mean(data), 1)
            elif alternative_method == "mean":
                subtest["value"] = round(filters.mean(data), 1)

        # helper functions below compute each subtest value and summarize each
        # suite; suites and subtests are converted into sorted lists further down
        def _process(subtest, alternative_method=""):
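            """Compute a subtest's summary value from its replicates.

            Power tests use the mean; visual metrics, perfstats and cpuTime use
            the median of all replicates; all other measurements drop the first
            replicate before taking the median. When an alternative summary
            method ("geomean" or "mean") is requested, the replicates are
            filtered and summarized with that method instead.
            """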
            if test["type"] == "power":
                subtest["value"] = filters.mean(subtest["replicates"])
            elif (
                subtest["name"] in VISUAL_METRICS
                or subtest["name"].startswith("perfstat")
                or subtest["name"] == "cpuTime"
            ):
                if alternative_method in ("geomean", "mean"):
                    _process_alt_method(subtest, alternative_method)
                else:
                    subtest["value"] = filters.median(subtest["replicates"])
            else:
                if alternative_method in ("geomean", "mean"):
                    _process_alt_method(subtest, alternative_method)
                else:
                    subtest["value"] = filters.median(
                        filters.ignore_first(subtest["replicates"], 1)
                    )
            return subtest

        def _process_suite(suite):
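            """Summarize a suite in place.

            Each subtest with replicates gets a summary value via _process
            (plus duplicates for any extra summary methods), subtests are
            sorted by name, and for non-pageload suites with more than one
            subtest an overall suite value is computed with construct_summary.
            """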
            suite["subtests"] = [
                _process(subtest)
                for subtest in suite["subtests"].values()
                if subtest["replicates"]
            ]

            # Duplicate for different summary values if needed
            if self.extra_summary_methods:
                new_subtests = []
                for subtest in suite["subtests"]:
                    try:
                        for alternative_method in self.extra_summary_methods:
                            new_subtest = copy.deepcopy(subtest)
                            new_subtest[
                                "name"
                            ] = f"{new_subtest['name']} ({alternative_method})"
                            _process(new_subtest, alternative_method)
                            new_subtests.append(new_subtest)
                    except Exception as e:
                        # Ignore failures here
                        LOG.info(f"Failed to summarize with alternative methods: {e}")
                        pass
                suite["subtests"].extend(new_subtests)

            suite["subtests"].sort(key=lambda subtest: subtest["name"])

            # for benchmarks there is generally more than one subtest in each cycle
            # and a benchmark-specific formula is needed to calculate the final score
            # we no longer summarise the page load as we alert on individual subtests
            # and the geometric mean was found to be of little value
            if len(suite["subtests"]) > 1 and suite["type"] != "pageload":
                vals = [
                    [subtest["value"], subtest["name"]] for subtest in suite["subtests"]
                ]
                testname = suite["name"]
                if suite["type"] == "power":
                    testname = "supporting_data"
                suite["value"] = self.construct_summary(vals, testname=testname)
            return suite

        LOG.info("preparing browsertime results for output")

        # check if we actually have any results
        if len(self.results) == 0:
            LOG.error(
                "no browsertime test results found for %s" % ", ".join(test_names)
            )
            return

        test_results = {"framework": {"name": "browsertime"}}

        # using a mapping so we can have a unique set of results given a name
        suites = {}

        for test in self.results:
            test_name = test["name"]
            extra_options = test["extra_options"]

            # If a test with the same name has different extra options, handle it
            # by appending the difference in options to the key used in `suites`. We
            # need to do a loop here in case we get a conflicting test name again.
            prev_name = test_name
            while (
                test_name in suites
                and suites[test_name]["extraOptions"] != extra_options
            ):
                missing = set(extra_options) - set(suites[test_name]["extraOptions"])
                if len(missing) == 0:
                    missing = set(suites[test_name]["extraOptions"]) - set(
                        extra_options
                    )
                test_name = test_name + "-".join(list(missing))

                if prev_name == test_name:
                    # Kill the loop if we get the same name again
                    break
                else:
                    prev_name = test_name

            suite = suites.setdefault(
                test_name,
                {
                    "name": test["name"],
                    "type": test["type"],
                    "extraOptions": extra_options,
                    # There may be unique options in tags now, but we don't want to remove the
                    # previous behaviour which includes the extra options in the tags.
                    "tags": test.get("tags", []) + extra_options,
                    "lowerIsBetter": test["lower_is_better"],
                    "unit": test["unit"],
                    "alertThreshold": float(test["alert_threshold"]),
                    # like suites, subtests are identified by names
                    "subtests": {},
                },
            )
            # Add the alert window settings if needed
            for alert_option, schema_name in (
                ("min_back_window", "minBackWindow"),
                ("max_back_window", "maxBackWindow"),
                ("fore_window", "foreWindow"),
            ):
                if test.get(alert_option, None) is not None:
                    suite[schema_name] = int(test[alert_option])

            # Set shouldAlert to False whenever self.app is chrome, chrome-m,
            # chromium, or custom-car (Chromium-as-Release)
            if self.app in ("chrome", "chrome-m", "chromium", "custom-car"):
                suite["shouldAlert"] = False
            # Check if the test has set optional properties
            if "alert_change_type" in test and "alertChangeType" not in suite:
                suite["alertChangeType"] = test["alert_change_type"]

            def _process_measurements(measurement_name, replicates):
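                """Build a perfherder subtest entry for one measurement type.

                The returned dict carries the subtest name, unit (cpuTime is
                always reported in ms), alert threshold, any alert window
                settings copied from the suite, the shouldAlert flag, and the
                raw replicates.
                """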
                subtest = {}
                subtest["name"] = measurement_name
                subtest["lowerIsBetter"] = test["subtest_lower_is_better"]
                subtest["alertThreshold"] = float(test["alert_threshold"])
                subtest["unit"] = (
                    "ms" if measurement_name == "cpuTime" else test["subtest_unit"]
                )

                # Add the alert window settings if needed here too in case
                # there is no summary value in the test
                for schema_name in (
                    "minBackWindow",
                    "maxBackWindow",
                    "foreWindow",
                ):
                    if suite.get(schema_name, None) is not None:
                        subtest[schema_name] = suite[schema_name]

                # if 'alert_on' is set for this particular measurement, then we want to set
                # the flag in the perfherder output to turn on alerting for this subtest
                if self.subtest_alert_on is not None:
                    if measurement_name in self.subtest_alert_on:
                        LOG.info(
                            "turning on subtest alerting for measurement type: %s"
                            % measurement_name
                        )
                        subtest["shouldAlert"] = True
                        if self.app in ("chrome", "chrome-m", "chromium", "custom-car"):
                            subtest["shouldAlert"] = False
                    else:
                        # Explicitly set `shouldAlert` to False so that the measurement
                        # is not alerted on. Otherwise Perfherder defaults to alerting.
                        LOG.info(
                            "turning off subtest alerting for measurement type: %s"
                            % measurement_name
                        )
                        subtest["shouldAlert"] = False
                subtest["replicates"] = replicates
                return subtest

            if test["type"] in ["pageload", "scenario", "power"]:
                for measurement_name, replicates in test["measurements"].items():
                    new_subtest = _process_measurements(measurement_name, replicates)
                    if measurement_name not in suite["subtests"]:
                        suite["subtests"][measurement_name] = new_subtest
                    else:
                        suite["subtests"][measurement_name]["replicates"].extend(
                            new_subtest["replicates"]
                        )

            elif "benchmark" in test["type"]:
                subtests = None

                if "speedometer" in test["measurements"]:
                    # this includes stylebench
                    subtests, vals = self.parseSpeedometerOutput(test)
                elif "ares6" in test["name"]:
                    subtests, vals = self.parseAresSixOutput(test)
                elif "motionmark" in test["measurements"]:
                    subtests, vals = self.parseMotionmarkOutput(test)
                elif "youtube-playback" in test["name"]:
                    subtests, vals = self.parseYoutubePlaybackPerformanceOutput(test)
                elif "unity-webgl" in test["name"]:
                    subtests, vals = self.parseUnityWebGLOutput(test)
                elif "webaudio" in test["measurements"]:
                    subtests, vals = self.parseWebaudioOutput(test)
                elif "wasm-godot" in test["measurements"]:
                    subtests, vals = self.parseWASMGodotOutput(test)
                elif "wasm-misc" in test["measurements"]:
                    subtests, vals = self.parseWASMMiscOutput(test)
                elif "sunspider" in test["measurements"]:
                    subtests, vals = self.parseSunspiderOutput(test)
                elif "assorted-dom" in test["measurements"]:
                    subtests, vals = self.parseAssortedDomOutput(test)
                elif "jetstream2" in test["measurements"]:
                    subtests, vals = self.parseJetstreamTwoOutput(test)
                elif "matrix-react-bench" in test["name"]:
                    subtests, vals = self.parseMatrixReactBenchOutput(test)
                elif "twitch-animation" in test["name"]:
                    subtests, vals = self.parseTwitchAnimationOutput(test)
                else:
                    # Attempt to parse the unknown benchmark by flattening the
                    # given data and merging all the arrays of non-iterable
                    # data that fall under the same key.
                    # XXX Note that this is not fully implemented for the summary
                    # of the metric or test as we don't have a use case for that yet.
                    subtests, vals = self.parseUnknown(test)

                if subtests is None:
                    raise Exception("No benchmark metrics found in browsertime results")

                suite["subtests"] = subtests

                if "cpuTime" in test["measurements"] and test.get(
                    "gather_cpuTime", None
                ):
                    replicates = test["measurements"]["cpuTime"]
                    cpu_subtest = _process_measurements("cpuTime", replicates)
                    _process(cpu_subtest)
                    suite["subtests"].append(cpu_subtest)

                # summarize overall results for benchmark-type tests
                if len(subtests) > 1:
                    suite["value"] = self.construct_summary(vals, testname=test["name"])
                subtests.sort(key=lambda subtest: subtest["name"])

        # convert suites to list
        suites = [
            s if "benchmark" in s["type"] else _process_suite(s)
            for s in suites.values()
        ]

        suites.sort(key=lambda suite: suite["name"])

        test_results["suites"] = suites
        self.summarized_results = test_results