diff options
Diffstat (limited to 'tools/tryselect/util')
-rw-r--r-- | tools/tryselect/util/__init__.py | 3 | ||||
-rw-r--r-- | tools/tryselect/util/dicttools.py | 51 | ||||
-rw-r--r-- | tools/tryselect/util/estimates.py | 127 | ||||
-rw-r--r-- | tools/tryselect/util/manage_estimates.py | 131 |
4 files changed, 312 insertions, 0 deletions
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.

# This listing holds several new files from tools/tryselect/util/; each
# section below is introduced by a banner naming the file it belongs to.
# Future statements have to precede every other statement, so the union of
# the ones used by dicttools.py and estimates.py is declared once here.
from __future__ import absolute_import, print_function, unicode_literals

import bisect
import copy
import json
import os
from datetime import datetime, timedelta

# ---------------------------------------------------------------------------
# tools/tryselect/util/__init__.py (new file)
#
# Contains only the MPL 2.0 license header reproduced at the top.
# ---------------------------------------------------------------------------


# ---------------------------------------------------------------------------
# tools/tryselect/util/dicttools.py (new file)
# ---------------------------------------------------------------------------


def merge_to(source, dest):
    """
    Merge dict and arrays (override scalar values)

    Keys from source override keys from dest, and elements from lists in source
    are appended to lists in dest.

    Values taken from ``source`` are stored in ``dest`` by reference, not
    copied.

    :param dict source: to copy from
    :param dict dest: to copy to (modified in place)
    :return: ``dest``
    """

    for key, value in source.items():
        # Override mismatching or empty types. The exact-type comparison is
        # deliberate: a missing key yields None, which never matches.
        if type(value) != type(dest.get(key)):  # noqa
            dest[key] = value
            continue

        # Merge dict
        if isinstance(value, dict):
            merge_to(value, dest[key])
            continue

        # Concatenate into a new list so neither input list is mutated.
        if isinstance(value, list):
            dest[key] = dest[key] + value
            continue

        dest[key] = value

    return dest


def merge(*objects):
    """
    Merge the given objects, using the semantics described for merge_to, with
    objects later in the list taking precedence.  From an inheritance
    perspective, "parents" should be listed before "children".

    Returns the result without modifying any arguments.

    :raises IndexError: if called with no arguments
    """
    result = copy.deepcopy(objects[0])
    for overlay in objects[1:]:
        # merge_to stores nested values from its source by reference.  Copy
        # each overlay first; otherwise a later overlay would be merged *into*
        # a dict still owned by an earlier argument, mutating the caller's
        # data, and the result would alias the arguments.
        merge_to(copy.deepcopy(overlay), result)
    return result


# ---------------------------------------------------------------------------
# tools/tryselect/util/estimates.py (new file)
# ---------------------------------------------------------------------------

TASK_DURATION_CACHE = "task_duration_history.json"
GRAPH_QUANTILE_CACHE = "graph_quantile_cache.csv"
TASK_DURATION_TAG_FILE = "task_duration_tag.json"


def find_all_dependencies(graph, tasklist):
    """Return the tasks pulled in transitively by ``tasklist``.

    :param dict graph: maps a task name to a list of the names it depends on
    :param tasklist: iterable of explicitly requested task names
    :return: sorted list of dependency names, excluding the requested tasks
        themselves and any name that is not a key of ``graph``
    """
    all_dependencies = dict()

    def find_dependencies(task):
        dependencies = set()
        if task in all_dependencies:
            return all_dependencies[task]
        if task not in graph:
            # Don't add tasks (and so durations) for
            # things optimised out.
            return dependencies
        dependencies.add(task)
        for dep in graph[task]:
            # Memoise per dependency so shared subtrees are walked once.
            all_dependencies[dep] = find_dependencies(dep)
            dependencies.update(all_dependencies[dep])
        return dependencies

    full_deps = set()
    for task in tasklist:
        full_deps.update(find_dependencies(task))

    # Since these have been asked for, they're not inherited dependencies.
    return sorted(full_deps - set(tasklist))


def find_longest_path(graph, tasklist, duration_data):
    """Return the largest cumulative duration along any dependency chain.

    :param dict graph: maps a task name to a list of the names it depends on
    :param tasklist: iterable of task names at which chains start
    :param dict duration_data: maps a task name to its duration; tasks
        without an entry count as 0.0
    :return: the maximum over ``tasklist`` of (own duration + longest chain
        below it), or 0 when ``tasklist`` is empty
    """
    dep_durations = dict()

    def find_dependency_durations(task):
        if task in dep_durations:
            return dep_durations[task]

        durations = [find_dependency_durations(dep) for dep in graph.get(task, list())]
        # Guarantees max() has an operand for tasks without dependencies.
        durations.append(0.0)
        md = max(durations) + duration_data.get(task, 0.0)
        dep_durations[task] = md
        return md

    longest_paths = [find_dependency_durations(task) for task in tasklist]
    # Default in case there are no tasks
    return max(longest_paths) if longest_paths else 0


def determine_quantile(quantiles_file, duration):
    """Locate ``duration`` among the boundaries listed in ``quantiles_file``.

    The file is read as one header line followed by one number per line;
    blank lines are ignored.

    :param str quantiles_file: path of the boundaries file
    :param timedelta duration: the duration to classify
    :return: ``int(100 * i / count)`` where ``i`` is the index of the first
        sorted boundary strictly greater than the duration (the last index
        when no boundary is greater), or None when the file lists no
        boundaries at all
    """
    seconds = duration.total_seconds()

    with open(quantiles_file) as f:
        f.readline()  # skip header
        boundaries = sorted(float(line.strip()) for line in f if line.strip())

    if not boundaries:
        # Nothing to compare against; previously this path crashed on an
        # unbound loop variable.
        return None

    # bisect_right yields the index of the first boundary > seconds; clamp to
    # the last index when the duration exceeds every boundary.
    index = min(bisect.bisect_right(boundaries, seconds), len(boundaries) - 1)
    # In case we weren't given 100 elements
    return 100 * index // len(boundaries)


def task_duration_data(cache_dir):
    """Load and return the JSON content of TASK_DURATION_CACHE in ``cache_dir``."""
    with open(os.path.join(cache_dir, TASK_DURATION_CACHE)) as f:
        return json.load(f)


def duration_summary(graph_cache_file, tasklist, cache_dir):
    """Summarise the expected cost of running ``tasklist``.

    :param graph_cache_file: path of a JSON file mapping task names to lists
        of dependency names; a falsy value means "no dependency graph"
    :param tasklist: sequence of selected task names
    :param str cache_dir: directory holding TASK_DURATION_CACHE and,
        optionally, GRAPH_QUANTILE_CACHE
    :return: dict with keys ``selected_duration`` and ``dependency_duration``
        (timedelta), ``selected_count`` and ``dependency_count`` (int),
        ``wall_duration_seconds`` (timedelta of the longest chain, truncated
        to whole seconds), ``eta_datetime`` (now + longest chain),
        ``task_durations`` (per selected task, whole seconds) and, only when
        the quantile cache exists and lists boundaries, ``quantile``
    """
    durations = task_duration_data(cache_dir)

    graph = dict()
    if graph_cache_file:
        with open(graph_cache_file) as f:
            graph = json.load(f)
    dependencies = find_all_dependencies(graph, tasklist)
    longest_path = find_longest_path(graph, tasklist, durations)

    # Each duration is truncated to whole seconds before summing.
    task_durations = {task: int(durations.get(task, 0.0)) for task in tasklist}
    total_requested_duration = timedelta(
        seconds=sum(int(durations.get(task, 0.0)) for task in tasklist)
    )
    total_dependency_duration = timedelta(
        seconds=sum(int(durations.get(task, 0.0)) for task in dependencies)
    )

    output = dict()
    output["selected_duration"] = total_requested_duration
    output["dependency_duration"] = total_dependency_duration
    output["dependency_count"] = len(dependencies)
    output["selected_count"] = len(tasklist)

    graph_quantile_cache = os.path.join(cache_dir, GRAPH_QUANTILE_CACHE)
    if os.path.isfile(graph_quantile_cache):
        position = determine_quantile(
            graph_quantile_cache, total_dependency_duration + total_requested_duration
        )
        # None means the cache file listed no boundaries; omit the key then.
        if position is not None:
            output["quantile"] = 100 - position

    output["wall_duration_seconds"] = timedelta(seconds=int(longest_path))
    output["eta_datetime"] = datetime.now() + timedelta(seconds=longest_path)

    output["task_durations"] = task_durations

    return output


# ---------------------------------------------------------------------------
# tools/tryselect/util/manage_estimates.py (new file)
# ---------------------------------------------------------------------------
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
from __future__ import absolute_import, print_function

import json
import os
from datetime import datetime, timedelta

import requests
import six

from .estimates import GRAPH_QUANTILE_CACHE, TASK_DURATION_CACHE, TASK_DURATION_TAG_FILE

TASK_DURATION_URL = (
    "https://storage.googleapis.com/mozilla-mach-data/task_duration_history.json"
)
GRAPH_QUANTILES_URL = (
    "https://storage.googleapis.com/mozilla-mach-data/machtry_quantiles.csv"
)


def check_downloaded_history(tag_file, duration_cache, quantile_cache):
    """Report whether the locally cached history files can be reused.

    :param str tag_file: JSON file holding a ``download_date`` entry written
        as ``%Y-%m-%d``
    :param str duration_cache: JSON file with the task durations
    :param str quantile_cache: file with the graph quantiles
    :return: True only when all three files exist, the recorded download
        date is less than seven days old, and the duration cache is readable
        JSON that is not a list (the old format); False otherwise
    """
    if not os.path.isfile(tag_file):
        return False

    try:
        with open(tag_file) as f:
            duration_tags = json.load(f)
        # "%m" is the month. The former "%M" (minutes) left the month at its
        # default of January, so fresh caches were judged stale.
        download_date = datetime.strptime(
            duration_tags.get("download_date"), "%Y-%m-%d"
        )
        if download_date < datetime.now() - timedelta(days=7):
            return False
    except (IOError, ValueError, TypeError, AttributeError):
        # Unreadable or malformed tag file, or a missing/non-string date:
        # treat the cache as unusable rather than crashing.
        return False

    if not os.path.isfile(duration_cache):
        return False
    # Check for old format version of file.
    try:
        with open(duration_cache) as f:
            data = json.load(f)
    except (IOError, ValueError):
        # A corrupt cache is as good as no cache; report it for re-download.
        return False
    if isinstance(data, list):
        return False
    if not os.path.isfile(quantile_cache):
        return False

    return True


def download_task_history_data(cache_dir):
    """Fetch task duration data exported from BigQuery.

    Does nothing when check_downloaded_history accepts the files already in
    ``cache_dir``.  Otherwise removes the old files, downloads the duration
    history and the graph quantiles, writes both into ``cache_dir`` and
    finally records today's date in the tag file.  A failed download is
    reported with a message and abandons the refresh; the tag file is then
    not written.

    :param str cache_dir: directory in which the cache files are kept
    """
    task_duration_cache = os.path.join(cache_dir, TASK_DURATION_CACHE)
    task_duration_tag_file = os.path.join(cache_dir, TASK_DURATION_TAG_FILE)
    graph_quantile_cache = os.path.join(cache_dir, GRAPH_QUANTILE_CACHE)

    if check_downloaded_history(
        task_duration_tag_file, task_duration_cache, graph_quantile_cache
    ):
        return

    # Remove each file independently: one missing file must not leave the
    # other stale files behind.
    removed_any = False
    for stale in (task_duration_tag_file, task_duration_cache, graph_quantile_cache):
        try:
            os.unlink(stale)
            removed_any = True
        except OSError:
            pass
    if not removed_any:
        print("No existing task history to clean up.")

    try:
        r = requests.get(TASK_DURATION_URL, stream=True)
        # Turn HTTP error statuses into RequestException so that an error
        # page is never parsed as duration data.
        r.raise_for_status()
    except requests.exceptions.RequestException as exc:
        # This is fine, the durations just won't be in the preview window.
        print(
            "Error fetching task duration cache from {}: {}".format(
                TASK_DURATION_URL, exc
            )
        )
        return

    # The data retrieved from google storage is a newline-separated
    # list of json entries, which Python's json module can't parse.
    duration_data = [json.loads(line) for line in r.text.splitlines() if line.strip()]

    # Reformat duration data to avoid list of dicts, as this is slow in the preview window
    duration_data = {d["name"]: d["mean_duration_seconds"] for d in duration_data}

    with open(task_duration_cache, "w") as f:
        json.dump(duration_data, f, indent=4)

    try:
        r = requests.get(GRAPH_QUANTILES_URL, stream=True)
        r.raise_for_status()
    except requests.exceptions.RequestException as exc:
        # This is fine, the percentile just won't be in the preview window.
        print(
            "Error fetching task group percentiles from {}: {}".format(
                GRAPH_QUANTILES_URL, exc
            )
        )
        return

    with open(graph_quantile_cache, "w") as f:
        f.write(six.ensure_text(r.content))

    # Written last, so the tag only exists once both downloads succeeded.
    with open(task_duration_tag_file, "w") as f:
        json.dump({"download_date": datetime.now().strftime("%Y-%m-%d")}, f, indent=4)


def make_trimmed_taskgraph_cache(graph_cache, dep_cache, target_file=None):
    """Trim the taskgraph cache used for dependencies.

    Speeds up the fzf preview window to less human-perceptible
    ranges.

    Writes to ``dep_cache`` a JSON mapping from task name to the list of
    values of that task's ``dependencies`` entry, restricted to the names
    that are keys of the JSON object in ``target_file``.  Returns without
    writing anything when ``graph_cache`` does not exist.  When
    ``target_file`` is not given or does not exist, no name is kept and the
    mapping written is empty.

    :param str graph_cache: path of the full taskgraph JSON
    :param str dep_cache: path of the trimmed JSON file to write
    :param target_file: optional path of a JSON object keyed by task name
    """
    if not os.path.isfile(graph_cache):
        return

    target_task_set = set()
    if target_file and os.path.isfile(target_file):
        with open(target_file) as f:
            target_task_set = set(json.load(f).keys())

    with open(graph_cache) as f:
        graph = json.load(f)
    graph = {
        name: list(defn["dependencies"].values())
        for name, defn in graph.items()
        if name in target_task_set
    }
    with open(dep_cache, "w") as f:
        json.dump(graph, f, indent=4)