# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
from collections import defaultdict
from pathlib import Path

from mozperftest.metrics.exceptions import (
    MetricsMissingResultsError,
    MetricsMultipleTransformsError,
)
from mozperftest.metrics.notebook import PerftestETL
from mozperftest.metrics.utils import metric_fields, validate_intermediate_results

# Command-line arguments shared by the metrics layers.
COMMON_ARGS = {
    "metrics": {
        "type": metric_fields,
        "nargs": "*",
        "default": [],
        "help": "The metrics that should be retrieved from the data.",
    },
    "prefix": {"type": str, "default": "", "help": "Prefix used by the output files."},
    "split-by": {
        "type": str,
        "default": None,
        "help": "A metric name to use for splitting the data. For instance, "
        "using browserScripts.pageinfo.url will split the data by the unique "
        "URLs that are found.",
    },
    "simplify-names": {
        "action": "store_true",
        "default": False,
        "help": "If set, metric names will be simplified to a single word. The PerftestETL "
        "combines dictionary keys by `.`, and the final key contains that value of the data. "
        "That final key becomes the new name of the metric.",
    },
    "simplify-exclude": {
        "nargs": "*",
        "default": ["statistics"],
        "help": "When renaming/simplifying metric names, entries with these strings "
        "will be ignored and won't get simplified. These options are only used when "
        "--simplify-names is set.",
    },
    "transformer": {
        "type": str,
        "default": None,
        "help": "The path to the file containing the custom transformer, "
        "or the module to import along with the class name, "
        "e.g. mozperftest.test.xpcshell:XpcShellTransformer",
    },
}


class MetricsStorage:
    """Holds data that is commonly used across all metrics layers.

    An instance of this class represents data for a given output
    path and prefix.
    """

    def __init__(self, output_path, prefix, logger):
        """Store the configuration and make sure the output directory exists.

        :param output_path str/Path: Directory used for the output; it is
            created (with its parents) if it does not exist yet.
        :param prefix str: Prefix used for the output files.
        :param logger: Object whose `warning` method is used for reporting.
        """
        self.prefix = prefix
        self.output_path = output_path
        self.stddata = {}
        self.ptnb_config = {}
        self.results = []
        # Populated by `set_results`; defined here so the attribute always
        # exists, even before any results were provided.
        self.settings = {}
        self.logger = logger

        Path(output_path).mkdir(parents=True, exist_ok=True)

    def _parse_results(self, results):
        """Flattens `results` into a list of entries to process.

        :param results dict/str/Path/list: A mapping is returned as-is inside
            a list. A path to a file yields its POSIX string; a path to a
            directory yields every `*.json` file found recursively under it.
            A list is flattened by parsing each of its items. A path that does
            not exist is reported with a warning and skipped. Any other type
            yields nothing.
        :return list: The parsed entries.
        """
        if isinstance(results, dict):
            return [results]

        res = []
        # XXX we need to embrace pathlib everywhere.
        if isinstance(results, (str, Path)):
            # Expecting a single path or a directory
            p = Path(results)
            if not p.exists():
                self.logger.warning("Given path does not exist: {}".format(results))
            elif p.is_dir():
                files = [f for f in p.glob("**/*.json") if not f.is_dir()]
                res.extend(self._parse_results(files))
            else:
                res.append(p.as_posix())
        elif isinstance(results, list):
            # Expecting a list of paths
            for path in results:
                res.extend(self._parse_results(path))
        return res

    def set_results(self, results):
        """Processes and sets results provided by the metadata.

        Each entry of `results` is validated with
        `validate_intermediate_results` and must provide a `name` and a
        `results` field. The `results` field is parsed with `_parse_results`
        (paths to files or directories), and entries sharing the same name are
        merged together. Every other field of an entry is stored in
        `self.settings` under the entry's name.

        :param results list: Intermediate result entries (mappings) to
            process.
        :raises NotImplementedError: If the `results` field of an entry is
            a dict (subtest-based processing).
        :raises MetricsMultipleTransformsError: If entries with the same name
            request different transformers.
        :raises MetricsMissingResultsError: If no entries were provided.
        """
        # Parse the results into files (for now) and the settings
        self.results = defaultdict(lambda: defaultdict(list))
        self.settings = defaultdict(dict)

        for res in results:
            # Ensure that the results are valid before continuing
            validate_intermediate_results(res)

            name = res["name"]
            if isinstance(res["results"], dict):
                # XXX Implement subtest based parsing
                raise NotImplementedError(
                    "Subtest-based processing is not implemented yet"
                )

            # Merge all entries with the same name into one
            # result, if separation is needed use unique names
            self.results[name]["files"].extend(self._parse_results(res["results"]))

            # Everything but the results themselves is kept as a setting
            self.settings[name].update(
                (key, val) for key, val in res.items() if key != "results"
            )

            # Check the transform definitions. The requested transformer is
            # resolved once (with its default) so that the error message
            # below cannot fail on entries without a `transformer` field.
            requested = res.get("transformer", "SingleJsonRetriever")
            current = self.results[name]["transformer"]
            if not current:
                self.results[name]["transformer"] = requested
            elif current != requested:
                raise MetricsMultipleTransformsError(
                    f"Only one transformer allowed per data name! Found multiple for {name}: "
                    f"{[current, requested]}"
                )

            # Get the transform options if available
            self.results[name]["options"] = res.get("transformer-options", {})

        if not self.results:
            self.return_code = 1
            raise MetricsMissingResultsError("Could not find any results to process.")

    def get_standardized_data(self, group_name="firefox", transformer=None):
        """Returns a parsed, standardized results data set.

        The dataset is computed once and then cached in `self.stddata`;
        later calls return the cached data regardless of the arguments.
        Each named result set is processed with `PerftestETL` using either
        the given `transformer` or the one recorded by `set_results`.

        :param group_name str: The name for this results group (currently
            not used by this method).
        :param transformer str: The transformer to use when parsing the
            data. When None, the transformer stored for each result set
            is used.
        :return dict: Standardized notebook data, keyed by result name.
        """
        if self.stddata:
            return self.stddata

        for data_type, data_info in self.results.items():
            tfm = transformer if transformer is not None else data_info["transformer"]
            prefix = f"{self.prefix}-{data_type}" if self.prefix else data_type

            # Primarily used to store the transformer used on the data
            # so that it can also be used for generating things
            # like summary values for suites, and subtests.
            self.ptnb_config[data_type] = {
                "output": self.output_path,
                "prefix": prefix,
                "custom_transformer": tfm,
                "file_groups": {data_type: data_info["files"]},
            }

            ptnb = PerftestETL(
                file_groups=self.ptnb_config[data_type]["file_groups"],
                config=self.ptnb_config[data_type],
                prefix=self.prefix,
                logger=self.logger,
                custom_transform=tfm,
            )
            r = ptnb.process(**data_info["options"])
            self.stddata[data_type] = r["data"]

        return self.stddata

    def _find_split_indices(self, results, split_by):
        """Maps each value of the `split_by` metric to its data indices.

        Within a data type the first subtest whose name contains `split_by`
        is used; when several data types have a match, the last data type
        wins.

        :param results dict: Standardized data, keyed by data type.
        :param split_by str: Name (or part of the name) of the metric to
            split by.
        :return dict/None: Mapping of value -> list of indices into the
            `data` list of the matching entry, or None when no subtest
            matches.
        """
        splitting_entry = None
        for data_info in results.values():
            for res in data_info:
                if split_by in res["subtest"]:
                    splitting_entry = res
                    break

        if splitting_entry is None:
            return None

        indices = defaultdict(list)
        for position, entry in enumerate(splitting_entry["data"]):
            indices[entry["value"]].append(position)
        return indices

    def filtered_metrics(
        self,
        group_name="firefox",
        transformer=None,
        metrics=None,
        exclude=None,
        split_by=None,
        simplify_names=False,
        simplify_exclude=["statistics"],
    ):
        """Filters the metrics to only those that were requested by `metrics`.

        If metrics is Falsey (None, empty list, etc.) then no metrics
        will be filtered, and the standardized data is returned as-is
        (no exclusion, renaming or splitting is applied). The entries in
        metrics are pattern matched with the subtests in the standardized
        data (not a regular expression). For example, if "firstPaint" is in
        metrics, then all subtests which contain this string in their name
        will be kept.

        :param group_name str: Passed to `get_standardized_data`.
        :param transformer str: Passed to `get_standardized_data`.
        :param metrics list: List of metrics to keep; each entry is a
            mapping whose `name` field is matched against subtest names.
        :param exclude list: List of string matchers to exclude from the
            metrics gathered/reported.
        :param split_by str: The name of a metric to use to split up data by.
            If no subtest matches it, a warning is logged and the data is
            not split.
        :param simplify_names bool: If True, metric names are reduced to
            the part following their last `.`.
        :param simplify_exclude list: List of string matchers to exclude
            from the naming simplification process.
        :return dict: Standardized notebook data containing the
            requested metrics.
        """
        results = self.get_standardized_data(
            group_name=group_name, transformer=transformer
        )
        if not metrics:
            return results
        if not exclude:
            exclude = []
        if not simplify_exclude:
            simplify_exclude = []

        # Get the field to split the results by (if any). The indices are
        # kept apart from `split_by` so that a metric which matches nothing
        # does not leave a plain string where a mapping is expected.
        split_indices = None
        if split_by is not None:
            split_indices = self._find_split_indices(results, split_by)
            if split_indices is None:
                self.logger.warning(
                    f"Could not find a metric matching `{split_by}` to split "
                    "the data by. The data will not be split."
                )

        # Filter metrics
        filtered = {}
        for data_type, data_info in results.items():
            custom_transformer = self.ptnb_config[data_type]["custom_transformer"]
            newresults = []
            for res in data_info:
                subtest = res["subtest"]
                if not any(met["name"] in subtest for met in metrics):
                    continue
                if any(met in subtest for met in exclude):
                    continue
                res["transformer"] = custom_transformer
                newresults.append(res)
            filtered[data_type] = newresults

        # Simplify the filtered metric names
        if simplify_names:

            def _simplify(name):
                if any(met in name for met in simplify_exclude):
                    return None
                return name.split(".")[-1]

            # The second argument is not used by `_alter_name`.
            self._alter_name(filtered, None, filter=_simplify)

        # Split the filtered results
        if split_indices is not None:
            newfilt = {}
            total_iterations = sum(len(inds) for inds in split_indices.values())
            for data_type, entries in filtered.items():
                if not entries:
                    # Ignore empty data types
                    continue

                custom_transformer = self.ptnb_config[data_type]["custom_transformer"]
                newresults = []
                newfilt[data_type] = newresults
                for split, indices in split_indices.items():
                    for res in entries:
                        if len(res["data"]) != total_iterations:
                            # Skip data that cannot be split
                            continue
                        splitres = dict(res)
                        splitres["subtest"] = f"{res['subtest']} {split}"
                        splitres["data"] = [res["data"][i] for i in indices]
                        splitres["transformer"] = custom_transformer
                        newresults.append(splitres)

            filtered = newfilt

        return filtered

    def _alter_name(self, filtered, res, filter):
        """Renames the subtests found in `filtered`, in place.

        A subtest is left untouched when `filter` returns None for its name,
        or when the new name was already given to another subtest (a warning
        is logged in that case).

        :param filtered dict: Filtered data, keyed by data type.
        :param res: Unused; kept so that existing callers keep working.
        :param filter callable: Receives a subtest name and returns the new
            name (a string), or None to keep the current one.
        """
        previous = set()
        for data_info in filtered.values():
            for entry in data_info:
                new = filter(entry["subtest"])
                if new is None:
                    continue
                if new in previous:
                    self.logger.warning(
                        f"Another metric which ends with `{new}` was already found. "
                        f"{entry['subtest']} will not be simplified."
                    )
                    continue
                entry["subtest"] = new
                previous.add(new)


# Cache of MetricsStorage instances, keyed by (path, prefix).
_metrics = {}


def filtered_metrics(
    metadata,
    path,
    prefix,
    group_name="firefox",
    transformer=None,
    metrics=None,
    settings=False,
    exclude=None,
    split_by=None,
    simplify_names=False,
    simplify_exclude=["statistics"],
):
    """Returns standardized data extracted from the metadata instance.

    We're caching an instance of MetricsStorage per metrics/storage
    combination and compute the data only once when this function is called.

    :param metadata: Provides the results through `get_results()`; it is
        also handed to MetricsStorage as its logger.
    :param path str/Path: Output path, part of the cache key.
    :param prefix str: Output file prefix, part of the cache key.
    :param settings bool: If True, the settings gathered for each result
        name are returned along with the data.
    :return dict/tuple: The filtered data, or a `(data, settings)` tuple
        when `settings` is True. The remaining parameters are passed through
        to `MetricsStorage.filtered_metrics`.
    """
    key = path, prefix
    if key not in _metrics:
        storage = _metrics[key] = MetricsStorage(path, prefix, metadata)
        storage.set_results(metadata.get_results())
    else:
        storage = _metrics[key]

    results = storage.filtered_metrics(
        group_name=group_name,
        transformer=transformer,
        metrics=metrics,
        exclude=exclude,
        split_by=split_by,
        simplify_names=simplify_names,
        simplify_exclude=simplify_exclude,
    )

    # XXX returning two different types is a problem
    # in case settings is false, we should return None for it
    # and always return a 2-tuple
    if settings:
        return results, storage.settings
    return results