| Field | Value | Date |
|---|---|---|
| author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-19 00:47:55 +0000 |
| committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-19 00:47:55 +0000 |
| commit | 26a029d407be480d791972afb5975cf62c9360a6 (patch) | |
| tree | f435a8308119effd964b339f76abb83a57c29483 /third_party/python/taskcluster_taskgraph/taskgraph | |
| parent | Initial commit. (diff) | |
| download | firefox-26a029d407be480d791972afb5975cf62c9360a6.tar.xz, firefox-26a029d407be480d791972afb5975cf62c9360a6.zip | |
Adding upstream version 124.0.1. (tag: upstream/124.0.1)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'third_party/python/taskcluster_taskgraph/taskgraph')
74 files changed, 15455 insertions, 0 deletions
diff --git a/third_party/python/taskcluster_taskgraph/taskgraph/__init__.py b/third_party/python/taskcluster_taskgraph/taskgraph/__init__.py
new file mode 100644
index 0000000000..81cc763230
--- /dev/null
+++ b/third_party/python/taskcluster_taskgraph/taskgraph/__init__.py
@@ -0,0 +1,16 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+__version__ = "6.3.0"
+
+# Maximum number of dependencies a single task can have
+# https://docs.taskcluster.net/reference/platform/taskcluster-queue/references/api#createTask
+# specifies 100, but we also optionally add the decision task id as a dep in
+# taskgraph.create, so let's set this to 99.
+MAX_DEPENDENCIES = 99
+
+# Enable fast task generation for local debugging
+# This is normally switched on via the --fast/-F flag to `mach taskgraph`
+# Currently this skips toolchain task optimizations and schema validation
+fast = False
diff --git a/third_party/python/taskcluster_taskgraph/taskgraph/actions/__init__.py b/third_party/python/taskcluster_taskgraph/taskgraph/actions/__init__.py
new file mode 100644
index 0000000000..590a957282
--- /dev/null
+++ b/third_party/python/taskcluster_taskgraph/taskgraph/actions/__init__.py
@@ -0,0 +1,16 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+
+from .registry import (
+    register_callback_action,
+    render_actions_json,
+    trigger_action_callback,
+)
+
+__all__ = [
+    "register_callback_action",
+    "render_actions_json",
+    "trigger_action_callback",
+]
diff --git a/third_party/python/taskcluster_taskgraph/taskgraph/actions/add_new_jobs.py b/third_party/python/taskcluster_taskgraph/taskgraph/actions/add_new_jobs.py
new file mode 100644
index 0000000000..c5e1821546
--- /dev/null
+++ b/third_party/python/taskcluster_taskgraph/taskgraph/actions/add_new_jobs.py
@@ -0,0 +1,64 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ + +from taskgraph.actions.registry import register_callback_action +from taskgraph.actions.util import ( + combine_task_graph_files, + create_tasks, + fetch_graph_and_labels, +) + + +@register_callback_action( + name="add-new-jobs", + title="Add new jobs", + generic=True, + symbol="add-new", + description="Add new jobs using task labels.", + order=100, + context=[], + schema={ + "type": "object", + "properties": { + "tasks": { + "type": "array", + "description": "An array of task labels", + "items": {"type": "string"}, + }, + "times": { + "type": "integer", + "default": 1, + "minimum": 1, + "maximum": 100, + "title": "Times", + "description": "How many times to run each task.", + }, + }, + }, +) +def add_new_jobs_action(parameters, graph_config, input, task_group_id, task_id): + decision_task_id, full_task_graph, label_to_taskid = fetch_graph_and_labels( + parameters, graph_config + ) + + to_run = [] + for elem in input["tasks"]: + if elem in full_task_graph.tasks: + to_run.append(elem) + else: + raise Exception(f"{elem} was not found in the task-graph") + + times = input.get("times", 1) + for i in range(times): + create_tasks( + graph_config, + to_run, + full_task_graph, + label_to_taskid, + parameters, + decision_task_id, + f"{i}", + ) + combine_task_graph_files(list(range(times))) diff --git a/third_party/python/taskcluster_taskgraph/taskgraph/actions/cancel.py b/third_party/python/taskcluster_taskgraph/taskgraph/actions/cancel.py new file mode 100644 index 0000000000..03788c6538 --- /dev/null +++ b/third_party/python/taskcluster_taskgraph/taskgraph/actions/cancel.py @@ -0,0 +1,42 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. + + +import logging + +import requests + +from taskgraph.util.taskcluster import cancel_task + +from .registry import register_callback_action + +logger = logging.getLogger(__name__) + + +@register_callback_action( + title="Cancel Task", + name="cancel", + symbol="cx", + generic=True, + description=("Cancel the given task"), + order=350, + context=[{}], +) +def cancel_action(parameters, graph_config, input, task_group_id, task_id): + # Note that this is limited by the scopes afforded to generic actions to + # only cancel tasks with the level-specific schedulerId. + try: + cancel_task(task_id, use_proxy=True) + except requests.HTTPError as e: + if e.response.status_code == 409: + # A 409 response indicates that this task is past its deadline. It + # cannot be cancelled at this time, but it's also not running + # anymore, so we can ignore this error. + logger.info( + 'Task "{}" is past its deadline and cannot be cancelled.'.format( + task_id + ) + ) + return + raise diff --git a/third_party/python/taskcluster_taskgraph/taskgraph/actions/cancel_all.py b/third_party/python/taskcluster_taskgraph/taskgraph/actions/cancel_all.py new file mode 100644 index 0000000000..d3e0440839 --- /dev/null +++ b/third_party/python/taskcluster_taskgraph/taskgraph/actions/cancel_all.py @@ -0,0 +1,61 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. 
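For reference, the `add-new-jobs` action defined above is driven by an input object matching its schema: an array of task labels plus an optional repeat count (1-100, default 1). A minimal illustration — the task labels here are hypothetical, not taken from this commit:

```python
# Hypothetical input payload for the "add-new-jobs" action above.
add_new_jobs_input = {
    "tasks": ["build-linux64/opt", "test-linux64/xpcshell-1"],  # illustrative labels
    "times": 2,
}
```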
+ + +import logging +import os +from concurrent import futures + +import requests + +from taskgraph.util.taskcluster import ( + CONCURRENCY, + cancel_task, + list_task_group_incomplete_tasks, +) + +from .registry import register_callback_action + +logger = logging.getLogger(__name__) + + +@register_callback_action( + title="Cancel All", + name="cancel-all", + generic=True, + symbol="cAll", + description=( + "Cancel all running and pending tasks created by the decision task " + "this action task is associated with." + ), + order=400, + context=[], +) +def cancel_all_action(parameters, graph_config, input, task_group_id, task_id): + def do_cancel_task(task_id): + logger.info(f"Cancelling task {task_id}") + try: + cancel_task(task_id, use_proxy=True) + except requests.HTTPError as e: + if e.response.status_code == 409: + # A 409 response indicates that this task is past its deadline. It + # cannot be cancelled at this time, but it's also not running + # anymore, so we can ignore this error. + logger.info( + "Task {} is past its deadline and cannot be cancelled.".format( + task_id + ) + ) + return + raise + + own_task_id = os.environ.get("TASK_ID", "") + to_cancel = [ + t for t in list_task_group_incomplete_tasks(task_group_id) if t != own_task_id + ] + logger.info(f"Cancelling {len(to_cancel)} tasks") + with futures.ThreadPoolExecutor(CONCURRENCY) as e: + cancel_futs = [e.submit(do_cancel_task, t) for t in to_cancel] + for f in futures.as_completed(cancel_futs): + f.result() diff --git a/third_party/python/taskcluster_taskgraph/taskgraph/actions/rebuild_cached_tasks.py b/third_party/python/taskcluster_taskgraph/taskgraph/actions/rebuild_cached_tasks.py new file mode 100644 index 0000000000..2b88e6a698 --- /dev/null +++ b/third_party/python/taskcluster_taskgraph/taskgraph/actions/rebuild_cached_tasks.py @@ -0,0 +1,36 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. + +from .registry import register_callback_action +from .util import create_tasks, fetch_graph_and_labels + + +@register_callback_action( + name="rebuild-cached-tasks", + title="Rebuild Cached Tasks", + symbol="rebuild-cached", + description="Rebuild cached tasks.", + order=1000, + context=[], +) +def rebuild_cached_tasks_action( + parameters, graph_config, input, task_group_id, task_id +): + decision_task_id, full_task_graph, label_to_taskid = fetch_graph_and_labels( + parameters, graph_config + ) + cached_tasks = [ + label + for label, task in full_task_graph.tasks.items() + if task.attributes.get("cached_task", False) + ] + if cached_tasks: + create_tasks( + graph_config, + cached_tasks, + full_task_graph, + label_to_taskid, + parameters, + decision_task_id, + ) diff --git a/third_party/python/taskcluster_taskgraph/taskgraph/actions/registry.py b/third_party/python/taskcluster_taskgraph/taskgraph/actions/registry.py new file mode 100644 index 0000000000..1e909d30c7 --- /dev/null +++ b/third_party/python/taskcluster_taskgraph/taskgraph/actions/registry.py @@ -0,0 +1,352 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ + +import json +from collections import namedtuple +from types import FunctionType + +from mozilla_repo_urls import parse + +from taskgraph import create +from taskgraph.config import load_graph_config +from taskgraph.parameters import Parameters +from taskgraph.util import hash, taskcluster, yaml +from taskgraph.util.memoize import memoize +from taskgraph.util.python_path import import_sibling_modules + +actions = [] +callbacks = {} + +Action = namedtuple("Action", ["order", "cb_name", "generic", "action_builder"]) + + +def is_json(data): + """Return ``True``, if ``data`` is a JSON serializable data structure.""" + try: + json.dumps(data) + except ValueError: + return False + return True + + +@memoize +def read_taskcluster_yml(filename): + """Load and parse .taskcluster.yml, memoized to save some time""" + return yaml.load_yaml(filename) + + +@memoize +def hash_taskcluster_yml(filename): + """ + Generate a hash of the given .taskcluster.yml. This is the first 10 digits + of the sha256 of the file's content, and is used by administrative scripts + to create a hook based on this content. + """ + return hash.hash_path(filename)[:10] + + +def register_callback_action( + name, + title, + symbol, + description, + order=10000, + context=[], + available=lambda parameters: True, + schema=None, + generic=True, + cb_name=None, +): + """ + Register an action callback that can be triggered from supporting + user interfaces, such as Treeherder. + + This function is to be used as a decorator for a callback that takes + parameters as follows: + + ``parameters``: + Decision task :class:`parameters <taskgraph.parameters.Parameters>`. + ``input``: + Input matching specified JSON schema, ``None`` if no ``schema`` + parameter is given to ``register_callback_action``. + ``task_group_id``: + The id of the task-group this was triggered for. + ``task_id`` and `task``: + task identifier and task definition for task the action was triggered + for, ``None`` if no ``context`` parameters was given to + ``register_callback_action``. + + Args: + name (str): + An identifier for this action, used by UIs to find the action. + title (str): + A human readable title for the action to be used as label on a button + or text on a link for triggering the action. + symbol (str): + Treeherder symbol for the action callback, this is the symbol that the + task calling your callback will be displayed as. This is usually 1-3 + letters abbreviating the action title. + description (str): + A human readable description of the action in **markdown**. + This will be display as tooltip and in dialog window when the action + is triggered. This is a good place to describe how to use the action. + order (int): + Order of the action in menus, this is relative to the ``order`` of + other actions declared. + context (list of dict): + List of tag-sets specifying which tasks the action is can take as input. + If no tag-sets is specified as input the action is related to the + entire task-group, and won't be triggered with a given task. + + Otherwise, if ``context = [{'k': 'b', 'p': 'l'}, {'k': 't'}]`` will only + be displayed in the context menu for tasks that has + ``task.tags.k == 'b' && task.tags.p = 'l'`` or ``task.tags.k = 't'``. + Essentially, this allows filtering on ``task.tags``. + + If this is a function, it is given the decision parameters and must return + a value of the form described above. + available (function): + An optional function that given decision parameters decides if the + action is available. 
Defaults to a function that always returns ``True``. + schema (dict): + JSON schema specifying input accepted by the action. + This is optional and can be left ``null`` if no input is taken. + generic (bool) + Whether this is a generic action or has its own permissions. + cb_name (str): + The name under which this function should be registered, defaulting to + `name`. This is used to generation actionPerm for non-generic hook + actions, and thus appears in ci-configuration and various role and hook + names. Unlike `name`, which can appear multiple times, cb_name must be + unique among all registered callbacks. + + Returns: + function: Decorator to be used for the callback function. + """ + mem = {"registered": False} # workaround nonlocal missing in 2.x + + assert isinstance(title, str), "title must be a string" + assert isinstance(description, str), "description must be a string" + title = title.strip() + description = description.strip() + + # ensure that context is callable + if not callable(context): + context_value = context + context = lambda params: context_value # noqa + + def register_callback(cb, cb_name=cb_name): + assert isinstance(name, str), "name must be a string" + assert isinstance(order, int), "order must be an integer" + assert callable(schema) or is_json( + schema + ), "schema must be a JSON compatible object" + assert isinstance(cb, FunctionType), "callback must be a function" + # Allow for json-e > 25 chars in the symbol. + if "$" not in symbol: + assert 1 <= len(symbol) <= 25, "symbol must be between 1 and 25 characters" + assert isinstance(symbol, str), "symbol must be a string" + + assert not mem[ + "registered" + ], "register_callback_action must be used as decorator" + if not cb_name: + cb_name = name + assert cb_name not in callbacks, "callback name {} is not unique".format( + cb_name + ) + + def action_builder(parameters, graph_config, decision_task_id): + if not available(parameters): + return None + + actionPerm = "generic" if generic else cb_name + + # gather up the common decision-task-supplied data for this action + repo_param = "head_repository" + repository = { + "url": parameters[repo_param], + "project": parameters["project"], + "level": parameters["level"], + } + + revision = parameters["head_rev"] + push = { + "owner": "mozilla-taskcluster-maintenance@mozilla.com", + "pushlog_id": parameters["pushlog_id"], + "revision": revision, + } + branch = parameters.get("head_ref") + if branch: + push["branch"] = branch + + action = { + "name": name, + "title": title, + "description": description, + # target taskGroupId (the task group this decision task is creating) + "taskGroupId": decision_task_id, + "cb_name": cb_name, + "symbol": symbol, + } + + rv = { + "name": name, + "title": title, + "description": description, + "context": context(parameters), + } + if schema: + rv["schema"] = ( + schema(graph_config=graph_config) if callable(schema) else schema + ) + + trustDomain = graph_config["trust-domain"] + level = parameters["level"] + tcyml_hash = hash_taskcluster_yml(graph_config.taskcluster_yml) + + # the tcyml_hash is prefixed with `/` in the hookId, so users will be granted + # hooks:trigger-hook:project-gecko/in-tree-action-3-myaction/*; if another + # action was named `myaction/release`, then the `*` in the scope would also + # match that action. To prevent such an accident, we prohibit `/` in hook + # names. 
+ if "/" in actionPerm: + raise Exception("`/` is not allowed in action names; use `-`") + + rv.update( + { + "kind": "hook", + "hookGroupId": f"project-{trustDomain}", + "hookId": "in-tree-action-{}-{}/{}".format( + level, actionPerm, tcyml_hash + ), + "hookPayload": { + # provide the decision-task parameters as context for triggerHook + "decision": { + "action": action, + "repository": repository, + "push": push, + }, + # and pass everything else through from our own context + "user": { + "input": {"$eval": "input"}, + "taskId": {"$eval": "taskId"}, # target taskId (or null) + "taskGroupId": { + "$eval": "taskGroupId" + }, # target task group + }, + }, + "extra": { + "actionPerm": actionPerm, + }, + } + ) + + return rv + + actions.append(Action(order, cb_name, generic, action_builder)) + + mem["registered"] = True + callbacks[cb_name] = cb + return cb + + return register_callback + + +def render_actions_json(parameters, graph_config, decision_task_id): + """ + Render JSON object for the ``public/actions.json`` artifact. + + Args: + parameters (:class:`~taskgraph.parameters.Parameters`): + Decision task parameters. + + Returns: + dict: + JSON object representation of the ``public/actions.json`` + artifact. + """ + assert isinstance(parameters, Parameters), "requires instance of Parameters" + actions = [] + for action in sorted(_get_actions(graph_config), key=lambda action: action.order): + action = action.action_builder(parameters, graph_config, decision_task_id) + if action: + assert is_json(action), "action must be a JSON compatible object" + actions.append(action) + return { + "version": 1, + "variables": {}, + "actions": actions, + } + + +def sanity_check_task_scope(callback, parameters, graph_config): + """ + If this action is not generic, then verify that this task has the necessary + scope to run the action. This serves as a backstop preventing abuse by + running non-generic actions using generic hooks. While scopes should + prevent serious damage from such abuse, it's never a valid thing to do. + """ + for action in _get_actions(graph_config): + if action.cb_name == callback: + break + else: + raise ValueError(f"No action with cb_name {callback}") + + actionPerm = "generic" if action.generic else action.cb_name + + repo_param = "head_repository" + raw_url = parameters[repo_param] + parsed_url = parse(raw_url) + expected_scope = f"assume:{parsed_url.taskcluster_role_prefix}:action:{actionPerm}" + + # the scope should appear literally; no need for a satisfaction check. The use of + # get_current_scopes here calls the auth service through the Taskcluster Proxy, giving + # the precise scopes available to this task. + if expected_scope not in taskcluster.get_current_scopes(): + raise ValueError(f"Expected task scope {expected_scope} for this action") + + +def trigger_action_callback( + task_group_id, task_id, input, callback, parameters, root, test=False +): + """ + Trigger action callback with the given inputs. If `test` is true, then run + the action callback in testing mode, without actually creating tasks. + """ + graph_config = load_graph_config(root) + graph_config.register() + callbacks = _get_callbacks(graph_config) + cb = callbacks.get(callback, None) + if not cb: + raise Exception( + "Unknown callback: {}. 
Known callbacks: {}".format( + callback, ", ".join(callbacks) + ) + ) + + if test: + create.testing = True + taskcluster.testing = True + + if not test: + sanity_check_task_scope(callback, parameters, graph_config) + + cb(Parameters(**parameters), graph_config, input, task_group_id, task_id) + + +def _load(graph_config): + # Load all modules from this folder, relying on the side-effects of register_ + # functions to populate the action registry. + import_sibling_modules(exceptions=("util.py",)) + return callbacks, actions + + +def _get_callbacks(graph_config): + return _load(graph_config)[0] + + +def _get_actions(graph_config): + return _load(graph_config)[1] diff --git a/third_party/python/taskcluster_taskgraph/taskgraph/actions/retrigger.py b/third_party/python/taskcluster_taskgraph/taskgraph/actions/retrigger.py new file mode 100644 index 0000000000..fd488b35fc --- /dev/null +++ b/third_party/python/taskcluster_taskgraph/taskgraph/actions/retrigger.py @@ -0,0 +1,301 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. + + +import logging +import sys +import textwrap + +from slugid import nice as slugid + +from taskgraph.util import taskcluster + +from .registry import register_callback_action +from .util import ( + combine_task_graph_files, + create_task_from_def, + create_tasks, + fetch_graph_and_labels, + relativize_datestamps, +) + +logger = logging.getLogger(__name__) + +RERUN_STATES = ("exception", "failed") + + +def _should_retrigger(task_graph, label): + """ + Return whether a given task in the taskgraph should be retriggered. + + This handles the case where the task isn't there by assuming it should not be. + """ + if label not in task_graph: + logger.info( + "Task {} not in full taskgraph, assuming task should not be retriggered.".format( + label + ) + ) + return False + return task_graph[label].attributes.get("retrigger", False) + + +@register_callback_action( + title="Retrigger", + name="retrigger", + symbol="rt", + cb_name="retrigger-decision", + description=textwrap.dedent( + """\ + Create a clone of the task (retriggering decision, action, and cron tasks requires + special scopes).""" + ), + order=11, + context=[ + {"kind": "decision-task"}, + {"kind": "action-callback"}, + {"kind": "cron-task"}, + ], +) +def retrigger_decision_action(parameters, graph_config, input, task_group_id, task_id): + """For a single task, we try to just run exactly the same task once more. + It's quite possible that we don't have the scopes to do so (especially for + an action), but this is best-effort.""" + + # make all of the timestamps relative; they will then be turned back into + # absolute timestamps relative to the current time. + task = taskcluster.get_task_definition(task_id) + task = relativize_datestamps(task) + create_task_from_def(slugid(), task, parameters["level"]) + + +@register_callback_action( + title="Retrigger", + name="retrigger", + symbol="rt", + generic=True, + description=("Create a clone of the task."), + order=19, # must be greater than other orders in this file, as this is the fallback version + context=[{"retrigger": "true"}], + schema={ + "type": "object", + "properties": { + "downstream": { + "type": "boolean", + "description": ( + "If true, downstream tasks from this one will be cloned as well. " + "The dependencies will be updated to work with the new task at the root." 
+ ), + "default": False, + }, + "times": { + "type": "integer", + "default": 1, + "minimum": 1, + "maximum": 100, + "title": "Times", + "description": "How many times to run each task.", + }, + }, + }, +) +@register_callback_action( + title="Retrigger (disabled)", + name="retrigger", + cb_name="retrigger-disabled", + symbol="rt", + generic=True, + description=( + "Create a clone of the task.\n\n" + "This type of task should typically be re-run instead of re-triggered." + ), + order=20, # must be greater than other orders in this file, as this is the fallback version + context=[{}], + schema={ + "type": "object", + "properties": { + "downstream": { + "type": "boolean", + "description": ( + "If true, downstream tasks from this one will be cloned as well. " + "The dependencies will be updated to work with the new task at the root." + ), + "default": False, + }, + "times": { + "type": "integer", + "default": 1, + "minimum": 1, + "maximum": 100, + "title": "Times", + "description": "How many times to run each task.", + }, + "force": { + "type": "boolean", + "default": False, + "description": ( + "This task should not be re-triggered. " + "This can be overridden by passing `true` here." + ), + }, + }, + }, +) +def retrigger_action(parameters, graph_config, input, task_group_id, task_id): + decision_task_id, full_task_graph, label_to_taskid = fetch_graph_and_labels( + parameters, graph_config + ) + + task = taskcluster.get_task_definition(task_id) + label = task["metadata"]["name"] + + with_downstream = " " + to_run = [label] + + if not input.get("force", None) and not _should_retrigger(full_task_graph, label): + logger.info( + "Not retriggering task {}, task should not be retrigged " + "and force not specified.".format(label) + ) + sys.exit(1) + + if input.get("downstream"): + to_run = full_task_graph.graph.transitive_closure( + set(to_run), reverse=True + ).nodes + to_run = to_run & set(label_to_taskid.keys()) + with_downstream = " (with downstream) " + + times = input.get("times", 1) + for i in range(times): + create_tasks( + graph_config, + to_run, + full_task_graph, + label_to_taskid, + parameters, + decision_task_id, + f"{i}", + ) + + logger.info(f"Scheduled {label}{with_downstream}(time {i + 1}/{times})") + combine_task_graph_files(list(range(times))) + + +@register_callback_action( + title="Rerun", + name="rerun", + generic=True, + symbol="rr", + description=( + "Rerun a task.\n\n" + "This only works on failed or exception tasks in the original taskgraph," + " and is CoT friendly." 
+ ), + order=300, + context=[{}], + schema={"type": "object", "properties": {}}, +) +def rerun_action(parameters, graph_config, input, task_group_id, task_id): + task = taskcluster.get_task_definition(task_id) + parameters = dict(parameters) + decision_task_id, full_task_graph, label_to_taskid = fetch_graph_and_labels( + parameters, graph_config + ) + label = task["metadata"]["name"] + if task_id not in label_to_taskid.values(): + logger.error( + "Refusing to rerun {}: taskId {} not in decision task {} label_to_taskid!".format( + label, task_id, decision_task_id + ) + ) + + _rerun_task(task_id, label) + + +def _rerun_task(task_id, label): + state = taskcluster.state_task(task_id) + if state not in RERUN_STATES: + logger.warning( + "No need to rerun {}: state '{}' not in {}!".format( + label, state, RERUN_STATES + ) + ) + return + taskcluster.rerun_task(task_id) + logger.info(f"Reran {label}") + + +@register_callback_action( + title="Retrigger", + name="retrigger-multiple", + symbol="rt", + generic=True, + description=("Create a clone of the task."), + context=[], + schema={ + "type": "object", + "properties": { + "requests": { + "type": "array", + "items": { + "tasks": { + "type": "array", + "description": "An array of task labels", + "items": {"type": "string"}, + }, + "times": { + "type": "integer", + "minimum": 1, + "maximum": 100, + "title": "Times", + "description": "How many times to run each task.", + }, + "additionalProperties": False, + }, + }, + "additionalProperties": False, + }, + }, +) +def retrigger_multiple(parameters, graph_config, input, task_group_id, task_id): + decision_task_id, full_task_graph, label_to_taskid = fetch_graph_and_labels( + parameters, graph_config + ) + + suffixes = [] + for i, request in enumerate(input.get("requests", [])): + times = request.get("times", 1) + rerun_tasks = [ + label + for label in request.get("tasks") + if not _should_retrigger(full_task_graph, label) + ] + retrigger_tasks = [ + label + for label in request.get("tasks") + if _should_retrigger(full_task_graph, label) + ] + + for label in rerun_tasks: + # XXX we should not re-run tasks pulled in from other pushes + # In practice, this shouldn't matter, as only completed tasks + # are pulled in from other pushes and treeherder won't pass + # those labels. + _rerun_task(label_to_taskid[label], label) + + for j in range(times): + suffix = f"{i}-{j}" + suffixes.append(suffix) + create_tasks( + graph_config, + retrigger_tasks, + full_task_graph, + label_to_taskid, + parameters, + decision_task_id, + suffix, + ) + + combine_task_graph_files(suffixes) diff --git a/third_party/python/taskcluster_taskgraph/taskgraph/actions/util.py b/third_party/python/taskcluster_taskgraph/taskgraph/actions/util.py new file mode 100644 index 0000000000..cf81029da2 --- /dev/null +++ b/third_party/python/taskcluster_taskgraph/taskgraph/actions/util.py @@ -0,0 +1,282 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. 
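The `register_callback_action` docstring in `registry.py` above describes the decorator's arguments and the callback signature, and the built-in actions in this directory (add-new-jobs, cancel, cancel-all, retrigger) all follow that pattern. A minimal sketch of a custom action, assuming a hypothetical `hello` action name and message input:

```python
from taskgraph.actions.registry import register_callback_action


@register_callback_action(
    name="hello",  # hypothetical action name
    title="Hello",
    symbol="hi",
    description="Log a greeting (illustrative only).",
    order=10000,
    context=[],  # no tag-sets: the action applies to the whole task group
    schema={
        "type": "object",
        "properties": {"message": {"type": "string", "default": "hello"}},
    },
    generic=True,
)
def hello_action(parameters, graph_config, input, task_group_id, task_id):
    # parameters are the decision task's Parameters; input matches the schema above
    print(f"{input.get('message', 'hello')} from {parameters['project']}")
```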
+ + +import copy +import logging +import os +import re +from concurrent import futures +from functools import reduce + +from requests.exceptions import HTTPError + +from taskgraph import create +from taskgraph.decision import read_artifact, rename_artifact, write_artifact +from taskgraph.optimize.base import optimize_task_graph +from taskgraph.taskgraph import TaskGraph +from taskgraph.util.taskcluster import ( + CONCURRENCY, + get_artifact, + get_session, + list_tasks, + parse_time, +) +from taskgraph.util.taskgraph import find_decision_task + +logger = logging.getLogger(__name__) + + +def get_parameters(decision_task_id): + return get_artifact(decision_task_id, "public/parameters.yml") + + +def fetch_graph_and_labels(parameters, graph_config): + decision_task_id = find_decision_task(parameters, graph_config) + + # First grab the graph and labels generated during the initial decision task + full_task_graph = get_artifact(decision_task_id, "public/full-task-graph.json") + _, full_task_graph = TaskGraph.from_json(full_task_graph) + label_to_taskid = get_artifact(decision_task_id, "public/label-to-taskid.json") + + # fetch everything in parallel; this avoids serializing any delay in downloading + # each artifact (such as waiting for the artifact to be mirrored locally) + with futures.ThreadPoolExecutor(CONCURRENCY) as e: + fetches = [] + + # fetch any modifications made by action tasks and swap out new tasks + # for old ones + def fetch_action(task_id): + logger.info(f"fetching label-to-taskid.json for action task {task_id}") + try: + run_label_to_id = get_artifact(task_id, "public/label-to-taskid.json") + label_to_taskid.update(run_label_to_id) + except HTTPError as e: + if e.response.status_code != 404: + raise + logger.debug(f"No label-to-taskid.json found for {task_id}: {e}") + + namespace = "{}.v2.{}.pushlog-id.{}.actions".format( + graph_config["trust-domain"], + parameters["project"], + parameters["pushlog_id"], + ) + for task_id in list_tasks(namespace): + fetches.append(e.submit(fetch_action, task_id)) + + # Similarly for cron tasks.. + def fetch_cron(task_id): + logger.info(f"fetching label-to-taskid.json for cron task {task_id}") + try: + run_label_to_id = get_artifact(task_id, "public/label-to-taskid.json") + label_to_taskid.update(run_label_to_id) + except HTTPError as e: + if e.response.status_code != 404: + raise + logger.debug(f"No label-to-taskid.json found for {task_id}: {e}") + + namespace = "{}.v2.{}.revision.{}.cron".format( + graph_config["trust-domain"], parameters["project"], parameters["head_rev"] + ) + for task_id in list_tasks(namespace): + fetches.append(e.submit(fetch_cron, task_id)) + + # now wait for each fetch to complete, raising an exception if there + # were any issues + for f in futures.as_completed(fetches): + f.result() + + return (decision_task_id, full_task_graph, label_to_taskid) + + +def create_task_from_def(task_id, task_def, level): + """Create a new task from a definition rather than from a label + that is already in the full-task-graph. The task definition will + have {relative-datestamp': '..'} rendered just like in a decision task. + Use this for entirely new tasks or ones that change internals of the task. + It is useful if you want to "edit" the full_task_graph and then hand + it to this function. No dependencies will be scheduled. You must handle + this yourself. 
Seeing how create_tasks handles it might prove helpful.""" + task_def["schedulerId"] = f"gecko-level-{level}" + label = task_def["metadata"]["name"] + session = get_session() + create.create_task(session, task_id, label, task_def) + + +def update_parent(task, graph): + task.task.setdefault("extra", {})["parent"] = os.environ.get("TASK_ID", "") + return task + + +def update_dependencies(task, graph): + if os.environ.get("TASK_ID"): + task.task.setdefault("dependencies", []).append(os.environ["TASK_ID"]) + return task + + +def create_tasks( + graph_config, + to_run, + full_task_graph, + label_to_taskid, + params, + decision_task_id=None, + suffix="", + modifier=lambda t: t, +): + """Create new tasks. The task definition will have {relative-datestamp': + '..'} rendered just like in a decision task. Action callbacks should use + this function to create new tasks, + allowing easy debugging with `mach taskgraph action-callback --test`. + This builds up all required tasks to run in order to run the tasks requested. + + Optionally this function takes a `modifier` function that is passed in each + task before it is put into a new graph. It should return a valid task. Note + that this is passed _all_ tasks in the graph, not just the set in to_run. You + may want to skip modifying tasks not in your to_run list. + + If `suffix` is given, then it is used to give unique names to the resulting + artifacts. If you call this function multiple times in the same action, + pass a different suffix each time to avoid overwriting artifacts. + + If you wish to create the tasks in a new group, leave out decision_task_id. + + Returns an updated label_to_taskid containing the new tasks""" + if suffix: + suffix = f"-{suffix}" + to_run = set(to_run) + + # Copy to avoid side-effects later + full_task_graph = copy.deepcopy(full_task_graph) + label_to_taskid = label_to_taskid.copy() + + target_graph = full_task_graph.graph.transitive_closure(to_run) + target_task_graph = TaskGraph( + {l: modifier(full_task_graph[l]) for l in target_graph.nodes}, target_graph + ) + target_task_graph.for_each_task(update_parent) + if decision_task_id and decision_task_id != os.environ.get("TASK_ID"): + target_task_graph.for_each_task(update_dependencies) + optimized_task_graph, label_to_taskid = optimize_task_graph( + target_task_graph, + to_run, + params, + to_run, + decision_task_id, + existing_tasks=label_to_taskid, + ) + write_artifact(f"task-graph{suffix}.json", optimized_task_graph.to_json()) + write_artifact(f"label-to-taskid{suffix}.json", label_to_taskid) + write_artifact(f"to-run{suffix}.json", list(to_run)) + create.create_tasks( + graph_config, + optimized_task_graph, + label_to_taskid, + params, + decision_task_id, + ) + return label_to_taskid + + +def _update_reducer(accumulator, new_value): + "similar to set or dict `update` method, but returning the modified object" + accumulator.update(new_value) + return accumulator + + +def combine_task_graph_files(suffixes): + """Combine task-graph-{suffix}.json files into a single task-graph.json file. + + Since Chain of Trust verification requires a task-graph.json file that + contains all children tasks, we can combine the various task-graph-0.json + type files into a master task-graph.json file at the end. + + Actions also look for various artifacts, so we combine those in a similar + fashion. + + In the case where there is only one suffix, we simply rename it to avoid the + additional cost of uploading two copies of the same data. 
+ """ + + if len(suffixes) == 1: + for filename in ["task-graph", "label-to-taskid", "to-run"]: + rename_artifact(f"{filename}-{suffixes[0]}.json", f"{filename}.json") + return + + def combine(file_contents, base): + return reduce(_update_reducer, file_contents, base) + + files = [read_artifact(f"task-graph-{suffix}.json") for suffix in suffixes] + write_artifact("task-graph.json", combine(files, dict())) + + files = [read_artifact(f"label-to-taskid-{suffix}.json") for suffix in suffixes] + write_artifact("label-to-taskid.json", combine(files, dict())) + + files = [read_artifact(f"to-run-{suffix}.json") for suffix in suffixes] + write_artifact("to-run.json", list(combine(files, set()))) + + +def relativize_datestamps(task_def): + """ + Given a task definition as received from the queue, convert all datestamps + to {relative_datestamp: ..} format, with the task creation time as "now". + The result is useful for handing to ``create_task``. + """ + base = parse_time(task_def["created"]) + # borrowed from https://github.com/epoberezkin/ajv/blob/master/lib/compile/formats.js + ts_pattern = re.compile( + r"^\d\d\d\d-[0-1]\d-[0-3]\d[t\s]" + r"(?:[0-2]\d:[0-5]\d:[0-5]\d|23:59:60)(?:\.\d+)?" + r"(?:z|[+-]\d\d:\d\d)$", + re.I, + ) + + def recurse(value): + if isinstance(value, str): + if ts_pattern.match(value): + value = parse_time(value) + diff = value - base + return {"relative-datestamp": f"{int(diff.total_seconds())} seconds"} + if isinstance(value, list): + return [recurse(e) for e in value] + if isinstance(value, dict): + return {k: recurse(v) for k, v in value.items()} + return value + + return recurse(task_def) + + +def add_args_to_command(cmd_parts, extra_args=[]): + """ + Add custom command line args to a given command. + + Args: + cmd_parts: the raw command as seen by taskcluster + extra_args: array of args we want to add + """ + cmd_type = "default" + if len(cmd_parts) == 1 and isinstance(cmd_parts[0], dict): + # windows has single cmd part as dict: 'task-reference', with long string + cmd_parts = cmd_parts[0]["task-reference"].split(" ") + cmd_type = "dict" + elif len(cmd_parts) == 1 and ( + isinstance(cmd_parts[0], str) or isinstance(cmd_parts[0], str) + ): + # windows has single cmd part as a long string + cmd_parts = cmd_parts[0].split(" ") + cmd_type = "unicode" + elif len(cmd_parts) == 1 and isinstance(cmd_parts[0], list): + # osx has an single value array with an array inside + cmd_parts = cmd_parts[0] + cmd_type = "subarray" + + cmd_parts.extend(extra_args) + + if cmd_type == "dict": + cmd_parts = [{"task-reference": " ".join(cmd_parts)}] + elif cmd_type == "unicode": + cmd_parts = [" ".join(cmd_parts)] + elif cmd_type == "subarray": + cmd_parts = [cmd_parts] + return cmd_parts diff --git a/third_party/python/taskcluster_taskgraph/taskgraph/config.py b/third_party/python/taskcluster_taskgraph/taskgraph/config.py new file mode 100644 index 0000000000..7ea7dc7b33 --- /dev/null +++ b/third_party/python/taskcluster_taskgraph/taskgraph/config.py @@ -0,0 +1,146 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ + +import logging +import os +import sys +from dataclasses import dataclass +from typing import Dict + +from voluptuous import All, Any, Extra, Length, Optional, Required + +from .util import path +from .util.python_path import find_object +from .util.schema import Schema, optionally_keyed_by, validate_schema +from .util.yaml import load_yaml + +logger = logging.getLogger(__name__) + +graph_config_schema = Schema( + { + # The trust-domain for this graph. + # (See https://firefox-source-docs.mozilla.org/taskcluster/taskcluster/taskgraph.html#taskgraph-trust-domain) # noqa + Required("trust-domain"): str, + Required("task-priority"): optionally_keyed_by( + "project", + Any( + "highest", + "very-high", + "high", + "medium", + "low", + "very-low", + "lowest", + ), + ), + Optional( + "task-deadline-after", + description="Default 'deadline' for tasks, in relative date format. " + "Eg: '1 week'", + ): optionally_keyed_by("project", str), + Required("workers"): { + Required("aliases"): { + str: { + Required("provisioner"): optionally_keyed_by("level", str), + Required("implementation"): str, + Required("os"): str, + Required("worker-type"): optionally_keyed_by("level", str), + } + }, + }, + Required("taskgraph"): { + Optional( + "register", + description="Python function to call to register extensions.", + ): str, + Optional("decision-parameters"): str, + Optional( + "cached-task-prefix", + description="The taskcluster index prefix to use for caching tasks. " + "Defaults to `trust-domain`.", + ): str, + Optional( + "index-path-regexes", + description="Regular expressions matching index paths to be summarized.", + ): [str], + Required("repositories"): All( + { + str: { + Required("name"): str, + Optional("project-regex"): str, + Optional("ssh-secret-name"): str, + # FIXME + Extra: str, + } + }, + Length(min=1), + ), + }, + Extra: object, + } +) +"""Schema for GraphConfig""" + + +@dataclass(frozen=True, eq=False) +class GraphConfig: + _config: Dict + root_dir: str + + _PATH_MODIFIED = False + + def __getitem__(self, name): + return self._config[name] + + def __contains__(self, name): + return name in self._config + + def register(self): + """ + Add the project's taskgraph directory to the python path, and register + any extensions present. + """ + modify_path = os.path.dirname(self.root_dir) + if GraphConfig._PATH_MODIFIED: + if GraphConfig._PATH_MODIFIED == modify_path: + # Already modified path with the same root_dir. + # We currently need to do this to enable actions to call + # taskgraph_decision, e.g. relpro. + return + raise Exception("Can't register multiple directories on python path.") + GraphConfig._PATH_MODIFIED = modify_path + sys.path.insert(0, modify_path) + register_path = self["taskgraph"].get("register") + if register_path: + find_object(register_path)(self) + + @property + def vcs_root(self): + if path.split(self.root_dir)[-2:] != ["taskcluster", "ci"]: + raise Exception( + "Not guessing path to vcs root. " + "Graph config in non-standard location." 
+ ) + return os.path.dirname(os.path.dirname(self.root_dir)) + + @property + def taskcluster_yml(self): + return os.path.join(self.vcs_root, ".taskcluster.yml") + + +def validate_graph_config(config): + validate_schema(graph_config_schema, config, "Invalid graph configuration:") + + +def load_graph_config(root_dir): + config_yml = os.path.join(root_dir, "config.yml") + if not os.path.exists(config_yml): + raise Exception(f"Couldn't find taskgraph configuration: {config_yml}") + + logger.debug(f"loading config from `{config_yml}`") + config = load_yaml(config_yml) + + validate_graph_config(config) + return GraphConfig(config, root_dir=root_dir) diff --git a/third_party/python/taskcluster_taskgraph/taskgraph/create.py b/third_party/python/taskcluster_taskgraph/taskgraph/create.py new file mode 100644 index 0000000000..deb1ac5348 --- /dev/null +++ b/third_party/python/taskcluster_taskgraph/taskgraph/create.py @@ -0,0 +1,132 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. + + +import json +import logging +import sys +from concurrent import futures + +from slugid import nice as slugid + +from taskgraph.util.parameterization import resolve_timestamps +from taskgraph.util.taskcluster import CONCURRENCY, get_session +from taskgraph.util.time import current_json_time + +logger = logging.getLogger(__name__) + +# this is set to true for `mach taskgraph action-callback --test` +testing = False + + +def create_tasks(graph_config, taskgraph, label_to_taskid, params, decision_task_id): + taskid_to_label = {t: l for l, t in label_to_taskid.items()} + + # when running as an actual decision task, we use the decision task's + # taskId as the taskGroupId. The process that created the decision task + # helpfully placed it in this same taskGroup. If there is no $TASK_ID, + # fall back to a slugid + scheduler_id = "{}-level-{}".format(graph_config["trust-domain"], params["level"]) + + # Add the taskGroupId, schedulerId and optionally the decision task + # dependency + for task_id in taskgraph.graph.nodes: + task_def = taskgraph.tasks[task_id].task + + # if this task has no dependencies *within* this taskgraph, make it + # depend on this decision task. If it has another dependency within + # the taskgraph, then it already implicitly depends on the decision + # task. The result is that tasks do not start immediately. if this + # loop fails halfway through, none of the already-created tasks run. + if not any(t in taskgraph.tasks for t in task_def.get("dependencies", [])): + task_def.setdefault("dependencies", []).append(decision_task_id) + + task_def["taskGroupId"] = decision_task_id + task_def["schedulerId"] = scheduler_id + + # If `testing` is True, then run without parallelization + concurrency = CONCURRENCY if not testing else 1 + session = get_session() + with futures.ThreadPoolExecutor(concurrency) as e: + fs = {} + + # We can't submit a task until its dependencies have been submitted. + # So our strategy is to walk the graph and submit tasks once all + # their dependencies have been submitted. 
+ tasklist = set(taskgraph.graph.visit_postorder()) + alltasks = tasklist.copy() + + def schedule_tasks(): + # bail out early if any futures have failed + if any(f.done() and f.exception() for f in fs.values()): + return + + to_remove = set() + new = set() + + def submit(task_id, label, task_def): + fut = e.submit(create_task, session, task_id, label, task_def) + new.add(fut) + fs[task_id] = fut + + for task_id in tasklist: + task_def = taskgraph.tasks[task_id].task + # If we haven't finished submitting all our dependencies yet, + # come back to this later. + # Some dependencies aren't in our graph, so make sure to filter + # those out + deps = set(task_def.get("dependencies", [])) & alltasks + if any((d not in fs or not fs[d].done()) for d in deps): + continue + + submit(task_id, taskid_to_label[task_id], task_def) + to_remove.add(task_id) + + # Schedule tasks as many times as task_duplicates indicates + attributes = taskgraph.tasks[task_id].attributes + for i in range(1, attributes.get("task_duplicates", 1)): + # We use slugid() since we want a distinct task id + submit(slugid(), taskid_to_label[task_id], task_def) + tasklist.difference_update(to_remove) + + # as each of those futures complete, try to schedule more tasks + for f in futures.as_completed(new): + schedule_tasks() + + # start scheduling tasks and run until everything is scheduled + schedule_tasks() + + # check the result of each future, raising an exception if it failed + for f in futures.as_completed(fs.values()): + f.result() + + +def create_task(session, task_id, label, task_def): + # create the task using 'http://taskcluster/queue', which is proxied to the queue service + # with credentials appropriate to this job. + + # Resolve timestamps + now = current_json_time(datetime_format=True) + task_def = resolve_timestamps(now, task_def) + + if testing: + json.dump( + [task_id, task_def], + sys.stdout, + sort_keys=True, + indent=4, + separators=(",", ": "), + ) + # add a newline + print("") + return + + logger.info(f"Creating task with taskId {task_id} for {label}") + res = session.put(f"http://taskcluster/queue/v1/task/{task_id}", json=task_def) + if res.status_code != 200: + try: + logger.error(res.json()["message"]) + except Exception: + logger.error(res.text) + res.raise_for_status() diff --git a/third_party/python/taskcluster_taskgraph/taskgraph/decision.py b/third_party/python/taskcluster_taskgraph/taskgraph/decision.py new file mode 100644 index 0000000000..ed412f4473 --- /dev/null +++ b/third_party/python/taskcluster_taskgraph/taskgraph/decision.py @@ -0,0 +1,379 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. 
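`create.py` above submits tasks through a thread pool, walking the graph in post-order and only submitting a task once all of its in-graph dependencies have been submitted. A simplified, serial sketch of that strategy — not the real implementation, which also handles task duplication and failure propagation:

```python
# Toy illustration of dependency-ordered submission, under the assumption
# that `deps` maps each task id to the set of task ids it depends on.
def submit_in_dependency_order(tasks, deps, submit):
    done = set()
    pending = set(tasks)
    while pending:
        # A task is ready once every dependency it has in the graph is submitted.
        ready = {t for t in pending if deps.get(t, set()) <= done}
        if not ready:
            raise RuntimeError("dependency cycle or missing dependency")
        for t in ready:
            submit(t)
            done.add(t)
        pending -= ready


submit_in_dependency_order(
    ["decision", "build", "test"],
    {"build": {"decision"}, "test": {"build"}},
    submit=print,  # stand-in for create_task()
)
```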
+ + +import json +import logging +import os +import pathlib +import shutil +import time +from pathlib import Path + +import yaml +from voluptuous import Optional + +from taskgraph.actions import render_actions_json +from taskgraph.create import create_tasks +from taskgraph.generator import TaskGraphGenerator +from taskgraph.parameters import Parameters, get_version +from taskgraph.taskgraph import TaskGraph +from taskgraph.util.python_path import find_object +from taskgraph.util.schema import Schema, validate_schema +from taskgraph.util.vcs import Repository, get_repository +from taskgraph.util.yaml import load_yaml + +logger = logging.getLogger(__name__) + +ARTIFACTS_DIR = Path("artifacts") + + +# For each project, this gives a set of parameters specific to the project. +# See `taskcluster/docs/parameters.rst` for information on parameters. +PER_PROJECT_PARAMETERS = { + # the default parameters are used for projects that do not match above. + "default": { + "target_tasks_method": "default", + } +} + + +try_task_config_schema_v2 = Schema( + { + Optional("parameters"): {str: object}, + } +) + + +def full_task_graph_to_runnable_jobs(full_task_json): + runnable_jobs = {} + for label, node in full_task_json.items(): + if not ("extra" in node["task"] and "treeherder" in node["task"]["extra"]): + continue + + th = node["task"]["extra"]["treeherder"] + runnable_jobs[label] = {"symbol": th["symbol"]} + + for i in ("groupName", "groupSymbol", "collection"): + if i in th: + runnable_jobs[label][i] = th[i] + if th.get("machine", {}).get("platform"): + runnable_jobs[label]["platform"] = th["machine"]["platform"] + return runnable_jobs + + +def taskgraph_decision(options, parameters=None): + """ + Run the decision task. This function implements `mach taskgraph decision`, + and is responsible for + + * processing decision task command-line options into parameters + * running task-graph generation exactly the same way the other `mach + taskgraph` commands do + * generating a set of artifacts to memorialize the graph + * calling TaskCluster APIs to create the graph + """ + + parameters = parameters or ( + lambda graph_config: get_decision_parameters(graph_config, options) + ) + + decision_task_id = os.environ["TASK_ID"] + + # create a TaskGraphGenerator instance + tgg = TaskGraphGenerator( + root_dir=options.get("root"), + parameters=parameters, + decision_task_id=decision_task_id, + write_artifacts=True, + ) + + # write out the parameters used to generate this graph + write_artifact("parameters.yml", dict(**tgg.parameters)) + + # write out the public/actions.json file + write_artifact( + "actions.json", + render_actions_json(tgg.parameters, tgg.graph_config, decision_task_id), + ) + + # write out the full graph for reference + full_task_json = tgg.full_task_graph.to_json() + write_artifact("full-task-graph.json", full_task_json) + + # write out the public/runnable-jobs.json file + write_artifact( + "runnable-jobs.json", full_task_graph_to_runnable_jobs(full_task_json) + ) + + # this is just a test to check whether the from_json() function is working + _, _ = TaskGraph.from_json(full_task_json) + + # write out the target task set to allow reproducing this as input + write_artifact("target-tasks.json", list(tgg.target_task_set.tasks.keys())) + + # write out the optimized task graph to describe what will actually happen, + # and the map of labels to taskids + write_artifact("task-graph.json", tgg.morphed_task_graph.to_json()) + write_artifact("label-to-taskid.json", tgg.label_to_taskid) + + # write out 
current run-task and fetch-content scripts + RUN_TASK_DIR = pathlib.Path(__file__).parent / "run-task" + shutil.copy2(RUN_TASK_DIR / "run-task", ARTIFACTS_DIR) + shutil.copy2(RUN_TASK_DIR / "fetch-content", ARTIFACTS_DIR) + + # actually create the graph + create_tasks( + tgg.graph_config, + tgg.morphed_task_graph, + tgg.label_to_taskid, + tgg.parameters, + decision_task_id=decision_task_id, + ) + + +def get_decision_parameters(graph_config, options): + """ + Load parameters from the command-line options for 'taskgraph decision'. + This also applies per-project parameters, based on the given project. + + """ + parameters = { + n: options[n] + for n in [ + "base_repository", + "base_ref", + "base_rev", + "head_repository", + "head_rev", + "head_ref", + "head_tag", + "project", + "pushlog_id", + "pushdate", + "repository_type", + "owner", + "level", + "target_tasks_method", + "tasks_for", + ] + if n in options + } + + repo_path = os.getcwd() + repo = get_repository(repo_path) + try: + commit_message = repo.get_commit_message() + except UnicodeDecodeError: + commit_message = "" + + parameters["base_ref"] = _determine_more_accurate_base_ref( + repo, + candidate_base_ref=options.get("base_ref"), + head_ref=options.get("head_ref"), + base_rev=options.get("base_rev"), + ) + + parameters["base_rev"] = _determine_more_accurate_base_rev( + repo, + base_ref=parameters["base_ref"], + candidate_base_rev=options.get("base_rev"), + head_rev=options.get("head_rev"), + env_prefix=_get_env_prefix(graph_config), + ) + + # Define default filter list, as most configurations shouldn't need + # custom filters. + parameters["filters"] = [ + "target_tasks_method", + ] + parameters["optimize_strategies"] = None + parameters["optimize_target_tasks"] = True + parameters["existing_tasks"] = {} + parameters["do_not_optimize"] = [] + parameters["enable_always_target"] = True + parameters["build_number"] = 1 + parameters["version"] = get_version(repo_path) + parameters["next_version"] = None + + # owner must be an email, but sometimes (e.g., for ffxbld) it is not, in which + # case, fake it + if "@" not in parameters["owner"]: + parameters["owner"] += "@noreply.mozilla.org" + + # use the pushdate as build_date if given, else use current time + parameters["build_date"] = parameters["pushdate"] or int(time.time()) + # moz_build_date is the build identifier based on build_date + parameters["moz_build_date"] = time.strftime( + "%Y%m%d%H%M%S", time.gmtime(parameters["build_date"]) + ) + + project = parameters["project"] + try: + parameters.update(PER_PROJECT_PARAMETERS[project]) + except KeyError: + logger.warning( + "using default project parameters; add {} to " + "PER_PROJECT_PARAMETERS in {} to customize behavior " + "for this project".format(project, __file__) + ) + parameters.update(PER_PROJECT_PARAMETERS["default"]) + + # `target_tasks_method` has higher precedence than `project` parameters + if options.get("target_tasks_method"): + parameters["target_tasks_method"] = options["target_tasks_method"] + + # ..but can be overridden by the commit message: if it contains the special + # string "DONTBUILD" and this is an on-push decision task, then use the + # special 'nothing' target task method. 
+ if "DONTBUILD" in commit_message and ( + options["tasks_for"] in ("hg-push", "github-push") + ): + parameters["target_tasks_method"] = "nothing" + + if options.get("optimize_target_tasks") is not None: + parameters["optimize_target_tasks"] = options["optimize_target_tasks"] + + if "decision-parameters" in graph_config["taskgraph"]: + find_object(graph_config["taskgraph"]["decision-parameters"])( + graph_config, parameters + ) + + if options.get("try_task_config_file"): + task_config_file = os.path.abspath(options.get("try_task_config_file")) + else: + # if try_task_config.json is present, load it + task_config_file = os.path.join(os.getcwd(), "try_task_config.json") + + # load try settings + if ("try" in project and options["tasks_for"] == "hg-push") or options[ + "tasks_for" + ] == "github-pull-request": + set_try_config(parameters, task_config_file) + + result = Parameters(**parameters) + result.check() + return result + + +def _determine_more_accurate_base_ref(repo, candidate_base_ref, head_ref, base_rev): + base_ref = candidate_base_ref + + if not candidate_base_ref: + base_ref = repo.default_branch + elif candidate_base_ref == head_ref and base_rev == Repository.NULL_REVISION: + logger.info( + "base_ref and head_ref are identical but base_rev equals the null revision. " + "This is a new branch but Github didn't identify its actual base." + ) + base_ref = repo.default_branch + + if base_ref != candidate_base_ref: + logger.info( + f'base_ref has been reset from "{candidate_base_ref}" to "{base_ref}".' + ) + + return base_ref + + +def _determine_more_accurate_base_rev( + repo, base_ref, candidate_base_rev, head_rev, env_prefix +): + if not candidate_base_rev: + logger.info("base_rev is not set.") + base_ref_or_rev = base_ref + elif candidate_base_rev == Repository.NULL_REVISION: + logger.info("base_rev equals the null revision. This branch is a new one.") + base_ref_or_rev = base_ref + elif not repo.does_revision_exist_locally(candidate_base_rev): + logger.warning( + "base_rev does not exist locally. It is likely because the branch was force-pushed. " + "taskgraph is not able to assess how many commits were changed and assumes it is only " + f"the last one. Please set the {env_prefix.upper()}_BASE_REV environment variable " + "in the decision task and provide `--base-rev` to taskgraph." + ) + base_ref_or_rev = base_ref + else: + base_ref_or_rev = candidate_base_rev + + if base_ref_or_rev == base_ref: + logger.info( + f'Using base_ref "{base_ref}" to determine latest common revision...' + ) + + base_rev = repo.find_latest_common_revision(base_ref_or_rev, head_rev) + if base_rev != candidate_base_rev: + if base_ref_or_rev == candidate_base_rev: + logger.info("base_rev is not an ancestor of head_rev.") + + logger.info( + f'base_rev has been reset from "{candidate_base_rev}" to "{base_rev}".' 
+ ) + + return base_rev + + +def _get_env_prefix(graph_config): + repo_keys = list(graph_config["taskgraph"].get("repositories", {}).keys()) + return repo_keys[0] if repo_keys else "" + + +def set_try_config(parameters, task_config_file): + if os.path.isfile(task_config_file): + logger.info(f"using try tasks from {task_config_file}") + with open(task_config_file) as fh: + task_config = json.load(fh) + task_config_version = task_config.pop("version") + if task_config_version == 2: + validate_schema( + try_task_config_schema_v2, + task_config, + "Invalid v2 `try_task_config.json`.", + ) + parameters.update(task_config["parameters"]) + return + else: + raise Exception( + f"Unknown `try_task_config.json` version: {task_config_version}" + ) + + +def write_artifact(filename, data): + logger.info(f"writing artifact file `{filename}`") + if not os.path.isdir(ARTIFACTS_DIR): + os.mkdir(ARTIFACTS_DIR) + path = ARTIFACTS_DIR / filename + if filename.endswith(".yml"): + with open(path, "w") as f: + yaml.safe_dump(data, f, allow_unicode=True, default_flow_style=False) + elif filename.endswith(".json"): + with open(path, "w") as f: + json.dump(data, f, sort_keys=True, indent=2, separators=(",", ": ")) + elif filename.endswith(".gz"): + import gzip + + with gzip.open(path, "wb") as f: + f.write(json.dumps(data)) + else: + raise TypeError(f"Don't know how to write to {filename}") + + +def read_artifact(filename): + path = ARTIFACTS_DIR / filename + if filename.endswith(".yml"): + return load_yaml(path, filename) + elif filename.endswith(".json"): + with open(path) as f: + return json.load(f) + elif filename.endswith(".gz"): + import gzip + + with gzip.open(path, "rb") as f: + return json.load(f) + else: + raise TypeError(f"Don't know how to read {filename}") + + +def rename_artifact(src, dest): + os.rename(ARTIFACTS_DIR / src, ARTIFACTS_DIR / dest) diff --git a/third_party/python/taskcluster_taskgraph/taskgraph/docker.py b/third_party/python/taskcluster_taskgraph/taskgraph/docker.py new file mode 100644 index 0000000000..23897cbbee --- /dev/null +++ b/third_party/python/taskcluster_taskgraph/taskgraph/docker.py @@ -0,0 +1,219 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. 
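+ # The helpers in this module back the `taskgraph build-image` and
+ # `taskgraph load-image` commands (see main.py). A minimal illustrative use
+ # from Python, assuming the repository defines a docker image named "linux":
+ #
+ #     from taskgraph.docker import load_image_by_name
+ #     load_image_by_name("linux", tag="example/linux:latest")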
+ + +import json +import os +import subprocess +import tarfile +from io import BytesIO +from textwrap import dedent + +try: + import zstandard as zstd +except ImportError as e: + zstd = e + +from taskgraph.util import docker +from taskgraph.util.taskcluster import get_artifact_url, get_session + + +def get_image_digest(image_name): + from taskgraph.generator import load_tasks_for_kind + from taskgraph.parameters import Parameters + + params = Parameters( + level=os.environ.get("MOZ_SCM_LEVEL", "3"), + strict=False, + ) + tasks = load_tasks_for_kind(params, "docker-image") + task = tasks[f"build-docker-image-{image_name}"] + return task.attributes["cached_task"]["digest"] + + +def load_image_by_name(image_name, tag=None): + from taskgraph.generator import load_tasks_for_kind + from taskgraph.optimize import IndexSearch + from taskgraph.parameters import Parameters + + params = Parameters( + level=os.environ.get("MOZ_SCM_LEVEL", "3"), + strict=False, + ) + tasks = load_tasks_for_kind(params, "docker-image") + task = tasks[f"build-docker-image-{image_name}"] + task_id = IndexSearch().should_replace_task( + task, {}, task.optimization.get("index-search", []) + ) + + if task_id in (True, False): + print( + "Could not find artifacts for a docker image " + "named `{image_name}`. Local commits and other changes " + "in your checkout may cause this error. Try " + "updating to a fresh checkout of mozilla-central " + "to download image.".format(image_name=image_name) + ) + return False + + return load_image_by_task_id(task_id, tag) + + +def load_image_by_task_id(task_id, tag=None): + artifact_url = get_artifact_url(task_id, "public/image.tar.zst") + result = load_image(artifact_url, tag) + print("Found docker image: {}:{}".format(result["image"], result["tag"])) + if tag: + print(f"Re-tagged as: {tag}") + else: + tag = "{}:{}".format(result["image"], result["tag"]) + print(f"Try: docker run -ti --rm {tag} bash") + return True + + +def build_context(name, outputFile, args=None): + """Build a context.tar for image with specified name.""" + if not name: + raise ValueError("must provide a Docker image name") + if not outputFile: + raise ValueError("must provide a outputFile") + + image_dir = docker.image_path(name) + if not os.path.isdir(image_dir): + raise Exception("image directory does not exist: %s" % image_dir) + + docker.create_context_tar(".", image_dir, outputFile, args) + + +def build_image(name, tag, args=None): + """Build a Docker image of specified name. + + Output from image building process will be printed to stdout. 
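+
+ Example (illustrative, assuming an image directory named "linux" exists):
+
+     build_image("linux", tag="example/linux:latest")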
+ """ + if not name: + raise ValueError("must provide a Docker image name") + + image_dir = docker.image_path(name) + if not os.path.isdir(image_dir): + raise Exception("image directory does not exist: %s" % image_dir) + + tag = tag or docker.docker_image(name, by_tag=True) + + buf = BytesIO() + docker.stream_context_tar(".", image_dir, buf, "", args) + subprocess.run( + ["docker", "image", "build", "--no-cache", "-t", tag, "-"], input=buf.getvalue() + ) + + print(f"Successfully built {name} and tagged with {tag}") + + if tag.endswith(":latest"): + print("*" * 50) + print("WARNING: no VERSION file found in image directory.") + print("Image is not suitable for deploying/pushing.") + print("Create an image suitable for deploying/pushing by creating") + print("a VERSION file in the image directory.") + print("*" * 50) + + +def load_image(url, imageName=None, imageTag=None): + """ + Load docker image from URL as imageName:tag, if no imageName or tag is given + it will use whatever is inside the zstd compressed tarball. + + Returns an object with properties 'image', 'tag' and 'layer'. + """ + if isinstance(zstd, ImportError): + raise ImportError( + dedent( + """ + zstandard is not installed! Use `pip install taskcluster-taskgraph[load-image]` + to use this feature. + """ + ) + ) from zstd + + # If imageName is given and we don't have an imageTag + # we parse out the imageTag from imageName, or default it to 'latest' + # if no imageName and no imageTag is given, 'repositories' won't be rewritten + if imageName and not imageTag: + if ":" in imageName: + imageName, imageTag = imageName.split(":", 1) + else: + imageTag = "latest" + + info = {} + + def download_and_modify_image(): + # This function downloads and edits the downloaded tar file on the fly. + # It emits chunked buffers of the edited tar file, as a generator. + print(f"Downloading from {url}") + # get_session() gets us a requests.Session set to retry several times. + req = get_session().get(url, stream=True) + req.raise_for_status() + + with zstd.ZstdDecompressor().stream_reader(req.raw) as ifh: + tarin = tarfile.open( + mode="r|", + fileobj=ifh, + bufsize=zstd.DECOMPRESSION_RECOMMENDED_OUTPUT_SIZE, + ) + + # Stream through each member of the downloaded tar file individually. + for member in tarin: + # Non-file members only need a tar header. Emit one. + if not member.isfile(): + yield member.tobuf(tarfile.GNU_FORMAT) + continue + + # Open stream reader for the member + reader = tarin.extractfile(member) + + # If member is `repositories`, we parse and possibly rewrite the + # image tags. + if member.name == "repositories": + # Read and parse repositories + repos = json.loads(reader.read()) + reader.close() + + # If there is more than one image or tag, we can't handle it + # here. + if len(repos.keys()) > 1: + raise Exception("file contains more than one image") + info["image"] = image = list(repos.keys())[0] + if len(repos[image].keys()) > 1: + raise Exception("file contains more than one tag") + info["tag"] = tag = list(repos[image].keys())[0] + info["layer"] = layer = repos[image][tag] + + # Rewrite the repositories file + data = json.dumps({imageName or image: {imageTag or tag: layer}}) + reader = BytesIO(data.encode("utf-8")) + member.size = len(data) + + # Emit the tar header for this member. + yield member.tobuf(tarfile.GNU_FORMAT) + # Then emit its content. 
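+ # The payload is streamed in chunks below and then zero-padded to a
+ # 512-byte boundary, as the tar format requires. Illustrative arithmetic:
+ # a 1300-byte member leaves 1300 % 512 == 276, so 512 - 276 == 236 NUL
+ # bytes of padding follow it.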
+ remaining = member.size + while remaining: + length = min(remaining, zstd.DECOMPRESSION_RECOMMENDED_OUTPUT_SIZE) + buf = reader.read(length) + remaining -= len(buf) + yield buf + # Pad to fill a 512 bytes block, per tar format. + remainder = member.size % 512 + if remainder: + yield ("\0" * (512 - remainder)).encode("utf-8") + + reader.close() + + subprocess.run( + ["docker", "image", "load"], input=b"".join(download_and_modify_image()) + ) + + # Check that we found a repositories file + if not info.get("image") or not info.get("tag") or not info.get("layer"): + raise Exception("No repositories file found!") + + return info diff --git a/third_party/python/taskcluster_taskgraph/taskgraph/files_changed.py b/third_party/python/taskcluster_taskgraph/taskgraph/files_changed.py new file mode 100644 index 0000000000..6be6e5eeee --- /dev/null +++ b/third_party/python/taskcluster_taskgraph/taskgraph/files_changed.py @@ -0,0 +1,91 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. + +""" +Support for optimizing tasks based on the set of files that have changed. +""" + + +import logging +import os + +import requests +from redo import retry + +from .util.memoize import memoize +from .util.path import match as match_path +from .util.vcs import get_repository + +logger = logging.getLogger(__name__) + + +@memoize +def get_changed_files(head_repository_url, head_rev, base_rev=None): + """ + Get the set of files changed between revisions. + Responses are cached, so multiple calls with the same arguments are OK. + """ + repo_path = os.getcwd() + repository = get_repository(repo_path) + + if repository.tool == "hg": + # TODO Use VCS version once tested enough + return _get_changed_files_json_automationrelevance( + head_repository_url, head_rev + ) + + return repository.get_changed_files(rev=head_rev, base_rev=base_rev) + + +def _get_changed_files_json_automationrelevance(head_repository_url, head_rev): + """ + Get the set of files changed in the push headed by the given revision. 
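+
+ For example (illustrative values), with head_repository_url
+ "https://hg.mozilla.org/mozilla-central" and head_rev "abcdef123456",
+ the URL queried is:
+
+     https://hg.mozilla.org/mozilla-central/json-automationrelevance/abcdef123456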
+ """ + url = "{}/json-automationrelevance/{}".format( + head_repository_url.rstrip("/"), head_rev + ) + logger.debug("Querying version control for metadata: %s", url) + + def get_automationrelevance(): + response = requests.get(url, timeout=30) + return response.json() + + contents = retry(get_automationrelevance, attempts=10, sleeptime=10) + + logger.debug( + "{} commits influencing task scheduling:".format(len(contents["changesets"])) + ) + changed_files = set() + for c in contents["changesets"]: + desc = "" # Support empty desc + if c["desc"]: + desc = c["desc"].splitlines()[0].encode("ascii", "ignore") + logger.debug(" {cset} {desc}".format(cset=c["node"][0:12], desc=desc)) + changed_files |= set(c["files"]) + + return changed_files + + +def check(params, file_patterns): + """Determine whether any of the files changed between 2 revisions + match any of the given file patterns.""" + + head_repository_url = params.get("head_repository") + head_rev = params.get("head_rev") + if not head_repository_url or not head_rev: + logger.warning( + "Missing `head_repository` or `head_rev` parameters; " + "assuming all files have changed" + ) + return True + + base_rev = params.get("base_rev") + changed_files = get_changed_files(head_repository_url, head_rev, base_rev) + + for pattern in file_patterns: + for path in changed_files: + if match_path(path, pattern): + return True + + return False diff --git a/third_party/python/taskcluster_taskgraph/taskgraph/filter_tasks.py b/third_party/python/taskcluster_taskgraph/taskgraph/filter_tasks.py new file mode 100644 index 0000000000..63bd2874d6 --- /dev/null +++ b/third_party/python/taskcluster_taskgraph/taskgraph/filter_tasks.py @@ -0,0 +1,34 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. + + +import logging + +from . import target_tasks + +logger = logging.getLogger(__name__) + +filter_task_functions = {} + + +def filter_task(name): + """Generator to declare a task filter function.""" + + def wrap(func): + filter_task_functions[name] = func + return func + + return wrap + + +@filter_task("target_tasks_method") +def filter_target_tasks(graph, parameters, graph_config): + """Proxy filter to use legacy target tasks code. + + This should go away once target_tasks are converted to filters. + """ + + attr = parameters.get("target_tasks_method", "all_tasks") + fn = target_tasks.get_method(attr) + return fn(graph, parameters, graph_config) diff --git a/third_party/python/taskcluster_taskgraph/taskgraph/generator.py b/third_party/python/taskcluster_taskgraph/taskgraph/generator.py new file mode 100644 index 0000000000..4ed2a41520 --- /dev/null +++ b/third_party/python/taskcluster_taskgraph/taskgraph/generator.py @@ -0,0 +1,451 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. + +import copy +import logging +import os +from dataclasses import dataclass +from typing import Dict + +from . 
import filter_tasks +from .config import GraphConfig, load_graph_config +from .graph import Graph +from .morph import morph +from .optimize.base import optimize_task_graph +from .parameters import parameters_loader +from .task import Task +from .taskgraph import TaskGraph +from .transforms.base import TransformConfig, TransformSequence +from .util.python_path import find_object +from .util.verify import verifications +from .util.yaml import load_yaml + +logger = logging.getLogger(__name__) + + +class KindNotFound(Exception): + """ + Raised when trying to load kind from a directory without a kind.yml. + """ + + +@dataclass(frozen=True) +class Kind: + name: str + path: str + config: Dict + graph_config: GraphConfig + + def _get_loader(self): + try: + loader = self.config["loader"] + except KeyError: + loader = "taskgraph.loader.default:loader" + return find_object(loader) + + def load_tasks(self, parameters, loaded_tasks, write_artifacts): + loader = self._get_loader() + config = copy.deepcopy(self.config) + + kind_dependencies = config.get("kind-dependencies", []) + kind_dependencies_tasks = { + task.label: task for task in loaded_tasks if task.kind in kind_dependencies + } + + inputs = loader(self.name, self.path, config, parameters, loaded_tasks) + + transforms = TransformSequence() + for xform_path in config["transforms"]: + if ":" not in xform_path: + xform_path = f"{xform_path}:transforms" + + transform = find_object(xform_path) + transforms.add(transform) + + # perform the transformations on the loaded inputs + trans_config = TransformConfig( + self.name, + self.path, + config, + parameters, + kind_dependencies_tasks, + self.graph_config, + write_artifacts=write_artifacts, + ) + tasks = [ + Task( + self.name, + label=task_dict["label"], + description=task_dict["description"], + attributes=task_dict["attributes"], + task=task_dict["task"], + optimization=task_dict.get("optimization"), + dependencies=task_dict.get("dependencies"), + soft_dependencies=task_dict.get("soft-dependencies"), + if_dependencies=task_dict.get("if-dependencies"), + ) + for task_dict in transforms(trans_config, inputs) + ] + return tasks + + @classmethod + def load(cls, root_dir, graph_config, kind_name): + path = os.path.join(root_dir, kind_name) + kind_yml = os.path.join(path, "kind.yml") + if not os.path.exists(kind_yml): + raise KindNotFound(kind_yml) + + logger.debug(f"loading kind `{kind_name}` from `{path}`") + config = load_yaml(kind_yml) + + return cls(kind_name, path, config, graph_config) + + +class TaskGraphGenerator: + """ + The central controller for taskgraph. This handles all phases of graph + generation. The task is generated from all of the kinds defined in + subdirectories of the generator's root directory. + + Access to the results of this generation, as well as intermediate values at + various phases of generation, is available via properties. This encourages + the provision of all generation inputs at instance construction time. + """ + + # Task-graph generation is implemented as a Python generator that yields + # each "phase" of generation. This allows some mach subcommands to short- + # circuit generation of the entire graph by never completing the generator. 
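+ # Illustrative use: accessing a property only runs the phases needed to
+ # produce it, e.g.
+ #
+ #     tgg = TaskGraphGenerator("taskcluster/ci", parameters)
+ #     tgg.full_task_set    # runs generation up to the full task set
+ #     tgg.full_task_graph  # resumes the generator for one more phase
+ #
+ # ("taskcluster/ci" is the default root directory; `parameters` is a
+ # Parameters instance or a callable returning one, as described in __init__.)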
+ + def __init__( + self, + root_dir, + parameters, + decision_task_id="DECISION-TASK", + write_artifacts=False, + ): + """ + @param root_dir: root directory, with subdirectories for each kind + @param parameters: parameters for this task-graph generation, or callable + taking a `GraphConfig` and returning parameters + @type parameters: Union[Parameters, Callable[[GraphConfig], Parameters]] + """ + if root_dir is None: + root_dir = "taskcluster/ci" + self.root_dir = root_dir + self._parameters = parameters + self._decision_task_id = decision_task_id + self._write_artifacts = write_artifacts + + # start the generator + self._run = self._run() + self._run_results = {} + + @property + def parameters(self): + """ + The properties used for this graph. + + @type: Properties + """ + return self._run_until("parameters") + + @property + def full_task_set(self): + """ + The full task set: all tasks defined by any kind (a graph without edges) + + @type: TaskGraph + """ + return self._run_until("full_task_set") + + @property + def full_task_graph(self): + """ + The full task graph: the full task set, with edges representing + dependencies. + + @type: TaskGraph + """ + return self._run_until("full_task_graph") + + @property + def target_task_set(self): + """ + The set of targeted tasks (a graph without edges) + + @type: TaskGraph + """ + return self._run_until("target_task_set") + + @property + def target_task_graph(self): + """ + The set of targeted tasks and all of their dependencies + + @type: TaskGraph + """ + return self._run_until("target_task_graph") + + @property + def optimized_task_graph(self): + """ + The set of targeted tasks and all of their dependencies; tasks that + have been optimized out are either omitted or replaced with a Task + instance containing only a task_id. + + @type: TaskGraph + """ + return self._run_until("optimized_task_graph") + + @property + def label_to_taskid(self): + """ + A dictionary mapping task label to assigned taskId. This property helps + in interpreting `optimized_task_graph`. + + @type: dictionary + """ + return self._run_until("label_to_taskid") + + @property + def morphed_task_graph(self): + """ + The optimized task graph, with any subsequent morphs applied. This graph + will have the same meaning as the optimized task graph, but be in a form + more palatable to TaskCluster. + + @type: TaskGraph + """ + return self._run_until("morphed_task_graph") + + @property + def graph_config(self): + """ + The configuration for this graph. + + @type: TaskGraph + """ + return self._run_until("graph_config") + + def _load_kinds(self, graph_config, target_kinds=None): + if target_kinds: + # docker-image is an implicit dependency that never appears in + # kind-dependencies. + queue = target_kinds + ["docker-image"] + seen_kinds = set() + while queue: + kind_name = queue.pop() + if kind_name in seen_kinds: + continue + seen_kinds.add(kind_name) + kind = Kind.load(self.root_dir, graph_config, kind_name) + yield kind + queue.extend(kind.config.get("kind-dependencies", [])) + else: + for kind_name in os.listdir(self.root_dir): + try: + yield Kind.load(self.root_dir, graph_config, kind_name) + except KindNotFound: + continue + + def _run(self): + logger.info("Loading graph configuration.") + graph_config = load_graph_config(self.root_dir) + + yield ("graph_config", graph_config) + + graph_config.register() + + # Initial verifications that don't depend on any generation state. 
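+ # ("initial" is one of the named verification points used during
+ # generation, alongside "parameters", "kinds", "full_task_set",
+ # "full_task_graph", "target_task_set", "target_task_graph",
+ # "optimized_task_graph" and "morphed_task_graph" below.)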
+ verifications("initial") + + if callable(self._parameters): + parameters = self._parameters(graph_config) + else: + parameters = self._parameters + + logger.info(f"Using {parameters}") + logger.debug(f"Dumping parameters:\n{repr(parameters)}") + + filters = parameters.get("filters", []) + # Always add legacy target tasks method until we deprecate that API. + if "target_tasks_method" not in filters: + filters.insert(0, "target_tasks_method") + filters = [filter_tasks.filter_task_functions[f] for f in filters] + + yield self.verify("parameters", parameters) + + logger.info("Loading kinds") + # put the kinds into a graph and sort topologically so that kinds are loaded + # in post-order + target_kinds = sorted(parameters.get("target-kinds", [])) + if target_kinds: + logger.info( + "Limiting kinds to following kinds and dependencies: {}".format( + ", ".join(target_kinds) + ) + ) + kinds = { + kind.name: kind for kind in self._load_kinds(graph_config, target_kinds) + } + verifications("kinds", kinds) + + edges = set() + for kind in kinds.values(): + for dep in kind.config.get("kind-dependencies", []): + edges.add((kind.name, dep, "kind-dependency")) + kind_graph = Graph(set(kinds), edges) + + if target_kinds: + kind_graph = kind_graph.transitive_closure( + set(target_kinds) | {"docker-image"} + ) + + logger.info("Generating full task set") + all_tasks = {} + for kind_name in kind_graph.visit_postorder(): + logger.debug(f"Loading tasks for kind {kind_name}") + kind = kinds[kind_name] + try: + new_tasks = kind.load_tasks( + parameters, + list(all_tasks.values()), + self._write_artifacts, + ) + except Exception: + logger.exception(f"Error loading tasks for kind {kind_name}:") + raise + for task in new_tasks: + if task.label in all_tasks: + raise Exception("duplicate tasks with label " + task.label) + all_tasks[task.label] = task + logger.info(f"Generated {len(new_tasks)} tasks for kind {kind_name}") + full_task_set = TaskGraph(all_tasks, Graph(set(all_tasks), set())) + yield self.verify("full_task_set", full_task_set, graph_config, parameters) + + logger.info("Generating full task graph") + edges = set() + for t in full_task_set: + for depname, dep in t.dependencies.items(): + if dep not in all_tasks.keys(): + raise Exception( + f"Task '{t.label}' lists a dependency that does not exist: '{dep}'" + ) + edges.add((t.label, dep, depname)) + + full_task_graph = TaskGraph(all_tasks, Graph(full_task_set.graph.nodes, edges)) + logger.info( + "Full task graph contains %d tasks and %d dependencies" + % (len(full_task_set.graph.nodes), len(edges)) + ) + yield self.verify("full_task_graph", full_task_graph, graph_config, parameters) + + logger.info("Generating target task set") + target_task_set = TaskGraph( + dict(all_tasks), Graph(set(all_tasks.keys()), set()) + ) + for fltr in filters: + old_len = len(target_task_set.graph.nodes) + target_tasks = set(fltr(target_task_set, parameters, graph_config)) + target_task_set = TaskGraph( + {l: all_tasks[l] for l in target_tasks}, Graph(target_tasks, set()) + ) + logger.info( + "Filter %s pruned %d tasks (%d remain)" + % (fltr.__name__, old_len - len(target_tasks), len(target_tasks)) + ) + + yield self.verify("target_task_set", target_task_set, graph_config, parameters) + + logger.info("Generating target task graph") + # include all tasks with `always_target` set + if parameters["enable_always_target"]: + always_target_tasks = { + t.label + for t in full_task_graph.tasks.values() + if t.attributes.get("always_target") + if parameters["enable_always_target"] is 
True + or t.kind in parameters["enable_always_target"] + } + else: + always_target_tasks = set() + logger.info( + "Adding %d tasks with `always_target` attribute" + % (len(always_target_tasks) - len(always_target_tasks & target_tasks)) + ) + requested_tasks = target_tasks | always_target_tasks + target_graph = full_task_graph.graph.transitive_closure(requested_tasks) + target_task_graph = TaskGraph( + {l: all_tasks[l] for l in target_graph.nodes}, target_graph + ) + yield self.verify( + "target_task_graph", target_task_graph, graph_config, parameters + ) + + logger.info("Generating optimized task graph") + existing_tasks = parameters.get("existing_tasks") + do_not_optimize = set(parameters.get("do_not_optimize", [])) + if not parameters.get("optimize_target_tasks", True): + do_not_optimize = set(target_task_set.graph.nodes).union(do_not_optimize) + + # this is used for testing experimental optimization strategies + strategies = os.environ.get( + "TASKGRAPH_OPTIMIZE_STRATEGIES", parameters.get("optimize_strategies") + ) + if strategies: + strategies = find_object(strategies) + + optimized_task_graph, label_to_taskid = optimize_task_graph( + target_task_graph, + requested_tasks, + parameters, + do_not_optimize, + self._decision_task_id, + existing_tasks=existing_tasks, + strategy_override=strategies, + ) + + yield self.verify( + "optimized_task_graph", optimized_task_graph, graph_config, parameters + ) + + morphed_task_graph, label_to_taskid = morph( + optimized_task_graph, label_to_taskid, parameters, graph_config + ) + + yield "label_to_taskid", label_to_taskid + yield self.verify( + "morphed_task_graph", morphed_task_graph, graph_config, parameters + ) + + def _run_until(self, name): + while name not in self._run_results: + try: + k, v = next(self._run) + except StopIteration: + raise AttributeError(f"No such run result {name}") + self._run_results[k] = v + return self._run_results[name] + + def verify(self, name, obj, *args, **kwargs): + verifications(name, obj, *args, **kwargs) + return name, obj + + +def load_tasks_for_kind(parameters, kind, root_dir=None): + """ + Get all the tasks of a given kind. + + This function is designed to be called from outside of taskgraph. + """ + # make parameters read-write + parameters = dict(parameters) + parameters["target-kinds"] = [kind] + parameters = parameters_loader(spec=None, strict=False, overrides=parameters) + tgg = TaskGraphGenerator(root_dir=root_dir, parameters=parameters) + return { + task.task["metadata"]["name"]: task + for task in tgg.full_task_set + if task.kind == kind + } diff --git a/third_party/python/taskcluster_taskgraph/taskgraph/graph.py b/third_party/python/taskcluster_taskgraph/taskgraph/graph.py new file mode 100644 index 0000000000..36b7f14984 --- /dev/null +++ b/third_party/python/taskcluster_taskgraph/taskgraph/graph.py @@ -0,0 +1,134 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. + + +import collections +from dataclasses import dataclass +from typing import FrozenSet + + +@dataclass(frozen=True) +class Graph: + """Generic representation of a directed acyclic graph with labeled edges + connecting the nodes. Graph operations are implemented in a functional + manner, so the data structure is immutable. + + It permits at most one edge of a given name between any set of nodes. 
The + graph is not checked for cycles, and methods may hang or otherwise fail if + given a cyclic graph. + + The `nodes` and `edges` attributes may be accessed in a read-only fashion. + The `nodes` attribute is a set of node names, while `edges` is a set of + `(left, right, name)` tuples representing an edge named `name` going from + node `left` to node `right`.. + """ + + nodes: FrozenSet + edges: FrozenSet + + def transitive_closure(self, nodes, reverse=False): + """Return the transitive closure of <nodes>: the graph containing all + specified nodes as well as any nodes reachable from them, and any + intervening edges. + + If `reverse` is true, the "reachability" will be reversed and this + will return the set of nodes that can reach the specified nodes. + + Example: + + .. code-block:: + + a ------> b ------> c + | + `-------> d + + transitive_closure([b]).nodes == set([a, b]) + transitive_closure([c]).nodes == set([c, b, a]) + transitive_closure([c], reverse=True).nodes == set([c]) + transitive_closure([b], reverse=True).nodes == set([b, c, d]) + """ + assert isinstance(nodes, set) + if not (nodes <= self.nodes): + raise Exception( + f"Unknown nodes in transitive closure: {nodes - self.nodes}" + ) + + # generate a new graph by expanding along edges until reaching a fixed + # point + new_nodes, new_edges = nodes, set() + nodes, edges = set(), set() + while (new_nodes, new_edges) != (nodes, edges): + nodes, edges = new_nodes, new_edges + add_edges = { + (left, right, name) + for (left, right, name) in self.edges + if (right if reverse else left) in nodes + } + add_nodes = {(left if reverse else right) for (left, right, _) in add_edges} + new_nodes = nodes | add_nodes + new_edges = edges | add_edges + return Graph(new_nodes, new_edges) + + def _visit(self, reverse): + queue = collections.deque(sorted(self.nodes)) + links_by_node = self.reverse_links_dict() if reverse else self.links_dict() + seen = set() + while queue: + node = queue.popleft() + if node in seen: + continue + links = links_by_node[node] + if all((n in seen) for n in links): + seen.add(node) + yield node + else: + queue.extend(n for n in links if n not in seen) + queue.append(node) + + def visit_postorder(self): + """ + Generate a sequence of nodes in postorder, such that every node is + visited *after* any nodes it links to. + + Behavior is undefined (read: it will hang) if the graph contains a + cycle. + """ + return self._visit(False) + + def visit_preorder(self): + """ + Like visit_postorder, but in reverse: evrey node is visited *before* + any nodes it links to. + """ + return self._visit(True) + + def links_dict(self): + """ + Return a dictionary mapping each node to a set of the nodes it links to + (omitting edge names) + """ + links = collections.defaultdict(set) + for left, right, _ in self.edges: + links[left].add(right) + return links + + def named_links_dict(self): + """ + Return a two-level dictionary mapping each node to a dictionary mapping + edge names to labels. 
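+
+ Example (illustrative):
+
+     Graph({"a", "b"}, {("a", "b", "dep")}).named_links_dict()
+     # => {"a": {"dep": "b"}} (as a defaultdict)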
+ """ + links = collections.defaultdict(dict) + for left, right, name in self.edges: + links[left][name] = right + return links + + def reverse_links_dict(self): + """ + Return a dictionary mapping each node to a set of the nodes linking to + it (omitting edge names) + """ + links = collections.defaultdict(set) + for left, right, _ in self.edges: + links[right].add(left) + return links diff --git a/third_party/python/taskcluster_taskgraph/taskgraph/loader/__init__.py b/third_party/python/taskcluster_taskgraph/taskgraph/loader/__init__.py new file mode 100644 index 0000000000..e69de29bb2 --- /dev/null +++ b/third_party/python/taskcluster_taskgraph/taskgraph/loader/__init__.py diff --git a/third_party/python/taskcluster_taskgraph/taskgraph/loader/default.py b/third_party/python/taskcluster_taskgraph/taskgraph/loader/default.py new file mode 100644 index 0000000000..5b2c258917 --- /dev/null +++ b/third_party/python/taskcluster_taskgraph/taskgraph/loader/default.py @@ -0,0 +1,33 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. + + +import logging + +from .transform import loader as transform_loader + +logger = logging.getLogger(__name__) + + +DEFAULT_TRANSFORMS = [ + "taskgraph.transforms.job:transforms", + "taskgraph.transforms.task:transforms", +] + + +def loader(kind, path, config, params, loaded_tasks): + """ + This default loader builds on the `transform` loader by providing sensible + default transforms that the majority of simple tasks will need. + Specifically, `job` and `task` transforms will be appended to the end of the + list of transforms in the kind being loaded. + """ + transform_refs = config.setdefault("transforms", []) + for t in DEFAULT_TRANSFORMS: + if t in config.get("transforms", ()): + raise KeyError( + f"Transform {t} is already present in the loader's default transforms; it must not be defined in the kind" + ) + transform_refs.extend(DEFAULT_TRANSFORMS) + return transform_loader(kind, path, config, params, loaded_tasks) diff --git a/third_party/python/taskcluster_taskgraph/taskgraph/loader/transform.py b/third_party/python/taskcluster_taskgraph/taskgraph/loader/transform.py new file mode 100644 index 0000000000..a134ffd127 --- /dev/null +++ b/third_party/python/taskcluster_taskgraph/taskgraph/loader/transform.py @@ -0,0 +1,58 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. + + +import logging + +from taskgraph.util.templates import merge +from taskgraph.util.yaml import load_yaml + +logger = logging.getLogger(__name__) + + +def loader(kind, path, config, params, loaded_tasks): + """ + Get the input elements that will be transformed into tasks in a generic + way. The elements themselves are free-form, and become the input to the + first transform. + + By default, this reads tasks from the `tasks` key, or from yaml files + named by `tasks-from`. The entities are read from mappings, and the + keys to those mappings are added in the `name` key of each entity. + + If there is a `task-defaults` config, then every task is merged with it. + This provides a simple way to set default values for all tasks of a kind. + The `task-defaults` key can also be specified in a yaml file pointed to by + `tasks-from`. 
In this case it will only apply to tasks defined in the same + file. + + Other kind implementations can use a different loader function to + produce inputs and hand them to `transform_inputs`. + """ + + def generate_tasks(): + defaults = config.get("task-defaults") + for name, task in config.get("tasks", {}).items(): + if defaults: + task = merge(defaults, task) + task["task-from"] = "kind.yml" + yield name, task + + for filename in config.get("tasks-from", []): + tasks = load_yaml(path, filename) + + file_defaults = tasks.pop("task-defaults", None) + if defaults: + file_defaults = merge(defaults, file_defaults or {}) + + for name, task in tasks.items(): + if file_defaults: + task = merge(file_defaults, task) + task["task-from"] = filename + yield name, task + + for name, task in generate_tasks(): + task["name"] = name + logger.debug(f"Generating tasks for {kind} {name}") + yield task diff --git a/third_party/python/taskcluster_taskgraph/taskgraph/main.py b/third_party/python/taskcluster_taskgraph/taskgraph/main.py new file mode 100644 index 0000000000..88a4e2539b --- /dev/null +++ b/third_party/python/taskcluster_taskgraph/taskgraph/main.py @@ -0,0 +1,875 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. + +import argparse +import atexit +import json +import logging +import os +import re +import shutil +import subprocess +import sys +import tempfile +import traceback +from collections import namedtuple +from concurrent.futures import ProcessPoolExecutor, as_completed +from pathlib import Path +from textwrap import dedent +from typing import Any, List + +import appdirs +import yaml + +Command = namedtuple("Command", ["func", "args", "kwargs", "defaults"]) +commands = {} + + +def command(*args, **kwargs): + defaults = kwargs.pop("defaults", {}) + + def decorator(func): + commands[args[0]] = Command(func, args, kwargs, defaults) + return func + + return decorator + + +def argument(*args, **kwargs): + def decorator(func): + if not hasattr(func, "args"): + func.args = [] + func.args.append((args, kwargs)) + return func + + return decorator + + +def format_taskgraph_labels(taskgraph): + return "\n".join( + sorted( + taskgraph.tasks[index].label for index in taskgraph.graph.visit_postorder() + ) + ) + + +def format_taskgraph_json(taskgraph): + return json.dumps( + taskgraph.to_json(), sort_keys=True, indent=2, separators=(",", ": ") + ) + + +def format_taskgraph_yaml(taskgraph): + return yaml.safe_dump(taskgraph.to_json(), default_flow_style=False) + + +def get_filtered_taskgraph(taskgraph, tasksregex, exclude_keys): + """ + Filter all the tasks on basis of a regular expression + and returns a new TaskGraph object + """ + from taskgraph.graph import Graph + from taskgraph.task import Task + from taskgraph.taskgraph import TaskGraph + + if tasksregex: + named_links_dict = taskgraph.graph.named_links_dict() + filteredtasks = {} + filterededges = set() + regexprogram = re.compile(tasksregex) + + for key in taskgraph.graph.visit_postorder(): + task = taskgraph.tasks[key] + if regexprogram.match(task.label): + filteredtasks[key] = task + for depname, dep in named_links_dict[key].items(): + if regexprogram.match(dep): + filterededges.add((key, dep, depname)) + + taskgraph = TaskGraph(filteredtasks, Graph(set(filteredtasks), filterededges)) + + if exclude_keys: + for label, task in taskgraph.tasks.items(): + task = task.to_json() + for key in 
exclude_keys: + obj = task + attrs = key.split(".") + while attrs[0] in obj: + if len(attrs) == 1: + del obj[attrs[0]] + break + obj = obj[attrs[0]] + attrs = attrs[1:] + taskgraph.tasks[label] = Task.from_json(task) + + return taskgraph + + +FORMAT_METHODS = { + "labels": format_taskgraph_labels, + "json": format_taskgraph_json, + "yaml": format_taskgraph_yaml, +} + + +def get_taskgraph_generator(root, parameters): + """Helper function to make testing a little easier.""" + from taskgraph.generator import TaskGraphGenerator + + return TaskGraphGenerator(root_dir=root, parameters=parameters) + + +def format_taskgraph(options, parameters, logfile=None): + import taskgraph + from taskgraph.parameters import parameters_loader + + if logfile: + handler = logging.FileHandler(logfile, mode="w") + if logging.root.handlers: + oldhandler = logging.root.handlers[-1] + logging.root.removeHandler(oldhandler) + handler.setFormatter(oldhandler.formatter) + logging.root.addHandler(handler) + + if options["fast"]: + taskgraph.fast = True + + if isinstance(parameters, str): + parameters = parameters_loader( + parameters, + overrides={"target-kinds": options.get("target_kinds")}, + strict=False, + ) + + tgg = get_taskgraph_generator(options.get("root"), parameters) + + tg = getattr(tgg, options["graph_attr"]) + tg = get_filtered_taskgraph(tg, options["tasks_regex"], options["exclude_keys"]) + format_method = FORMAT_METHODS[options["format"] or "labels"] + return format_method(tg) + + +def dump_output(out, path=None, params_spec=None): + from taskgraph.parameters import Parameters + + params_name = Parameters.format_spec(params_spec) + fh = None + if path: + # Substitute params name into file path if necessary + if params_spec and "{params}" not in path: + name, ext = os.path.splitext(path) + name += "_{params}" + path = name + ext + + path = path.format(params=params_name) + fh = open(path, "w") + else: + print( + f"Dumping result with parameters from {params_name}:", + file=sys.stderr, + ) + print(out + "\n", file=fh) + + +def generate_taskgraph(options, parameters, logdir): + from taskgraph.parameters import Parameters + + def logfile(spec): + """Determine logfile given a parameters specification.""" + if logdir is None: + return None + return os.path.join( + logdir, + "{}_{}.log".format(options["graph_attr"], Parameters.format_spec(spec)), + ) + + # Don't bother using futures if there's only one parameter. This can make + # tracebacks a little more readable and avoids additional process overhead. + if len(parameters) == 1: + spec = parameters[0] + out = format_taskgraph(options, spec, logfile(spec)) + dump_output(out, options["output_file"]) + return 0 + + futures = {} + with ProcessPoolExecutor(max_workers=options["max_workers"]) as executor: + for spec in parameters: + f = executor.submit(format_taskgraph, options, spec, logfile(spec)) + futures[f] = spec + + returncode = 0 + for future in as_completed(futures): + output_file = options["output_file"] + spec = futures[future] + e = future.exception() + if e: + returncode = 1 + out = "".join(traceback.format_exception(type(e), e, e.__traceback__)) + if options["diff"]: + # Dump to console so we don't accidentally diff the tracebacks. 
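+ # (With no output path, dump_output() falls back to printing to the
+ # console rather than writing a file.)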
+ output_file = None + else: + out = future.result() + + dump_output( + out, + path=output_file, + params_spec=spec if len(parameters) > 1 else None, + ) + + return returncode + + +@command( + "tasks", + help="Show all tasks in the taskgraph.", + defaults={"graph_attr": "full_task_set"}, +) +@command( + "full", help="Show the full taskgraph.", defaults={"graph_attr": "full_task_graph"} +) +@command( + "target", + help="Show the set of target tasks.", + defaults={"graph_attr": "target_task_set"}, +) +@command( + "target-graph", + help="Show the target graph.", + defaults={"graph_attr": "target_task_graph"}, +) +@command( + "optimized", + help="Show the optimized graph.", + defaults={"graph_attr": "optimized_task_graph"}, +) +@command( + "morphed", + help="Show the morphed graph.", + defaults={"graph_attr": "morphed_task_graph"}, +) +@argument("--root", "-r", help="root of the taskgraph definition relative to topsrcdir") +@argument("--quiet", "-q", action="store_true", help="suppress all logging output") +@argument( + "--verbose", "-v", action="store_true", help="include debug-level logging output" +) +@argument( + "--json", + "-J", + action="store_const", + dest="format", + const="json", + help="Output task graph as a JSON object", +) +@argument( + "--yaml", + "-Y", + action="store_const", + dest="format", + const="yaml", + help="Output task graph as a YAML object", +) +@argument( + "--labels", + "-L", + action="store_const", + dest="format", + const="labels", + help="Output the label for each task in the task graph (default)", +) +@argument( + "--parameters", + "-p", + default=None, + action="append", + help="Parameters to use for the generation. Can be a path to file (.yml or " + ".json; see `taskcluster/docs/parameters.rst`), a directory (containing " + "parameters files), a url, of the form `project=mozilla-central` to download " + "latest parameters file for the specified project from CI, or of the form " + "`task-id=<decision task id>` to download parameters from the specified " + "decision task. Can be specified multiple times, in which case multiple " + "generations will happen from the same invocation (one per parameters " + "specified).", +) +@argument( + "--no-optimize", + dest="optimize", + action="store_false", + default="true", + help="do not remove tasks from the graph that are found in the " + "index (a.k.a. optimize the graph)", +) +@argument( + "-o", + "--output-file", + default=None, + help="file path to store generated output.", +) +@argument( + "--tasks-regex", + "--tasks", + default=None, + help="only return tasks with labels matching this regular " "expression.", +) +@argument( + "--exclude-key", + default=None, + dest="exclude_keys", + action="append", + help="Exclude the specified key (using dot notation) from the final result. " + "This is mainly useful with '--diff' to filter out expected differences. Can be " + "used multiple times.", +) +@argument( + "-k", + "--target-kind", + dest="target_kinds", + action="append", + default=[], + help="only return tasks that are of the given kind, or their dependencies.", +) +@argument( + "-F", + "--fast", + default=False, + action="store_true", + help="enable fast task generation for local debugging.", +) +@argument( + "--diff", + const="default", + nargs="?", + default=None, + help="Generate and diff the current taskgraph against another revision. " + "Without args the base revision will be used. 
A revision specifier such as " + "the hash or `.~1` (hg) or `HEAD~1` (git) can be used as well.", +) +@argument( + "-j", + "--max-workers", + dest="max_workers", + default=None, + type=int, + help="The maximum number of workers to use for parallel operations such as" + "when multiple parameters files are passed.", +) +def show_taskgraph(options): + from taskgraph.parameters import Parameters, parameters_loader + from taskgraph.util.vcs import get_repository + + if options.pop("verbose", False): + logging.root.setLevel(logging.DEBUG) + + repo = None + cur_rev = None + diffdir = None + output_file = options["output_file"] + + if options["diff"]: + repo = get_repository(os.getcwd()) + + if not repo.working_directory_clean(): + print( + "abort: can't diff taskgraph with dirty working directory", + file=sys.stderr, + ) + return 1 + + # We want to return the working directory to the current state + # as best we can after we're done. In all known cases, using + # branch or bookmark (which are both available on the VCS object) + # as `branch` is preferable to a specific revision. + cur_rev = repo.branch or repo.head_rev[:12] + cur_rev_file = cur_rev.replace("/", "_") + + diffdir = tempfile.mkdtemp() + atexit.register( + shutil.rmtree, diffdir + ) # make sure the directory gets cleaned up + options["output_file"] = os.path.join( + diffdir, f"{options['graph_attr']}_{cur_rev_file}" + ) + print(f"Generating {options['graph_attr']} @ {cur_rev}", file=sys.stderr) + + parameters: List[Any[str, Parameters]] = options.pop("parameters") + if not parameters: + overrides = { + "target-kinds": options.get("target_kinds"), + } + parameters = [ + parameters_loader(None, strict=False, overrides=overrides) + ] # will use default values + + for param in parameters[:]: + if isinstance(param, str) and os.path.isdir(param): + parameters.remove(param) + parameters.extend( + [ + p.as_posix() + for p in Path(param).iterdir() + if p.suffix in (".yml", ".json") + ] + ) + + logdir = None + if len(parameters) > 1: + # Log to separate files for each process instead of stderr to + # avoid interleaving. + basename = os.path.basename(os.getcwd()) + logdir = os.path.join(appdirs.user_log_dir("taskgraph"), basename) + if not os.path.isdir(logdir): + os.makedirs(logdir) + else: + # Only setup logging if we have a single parameter spec. Otherwise + # logging will go to files. This is also used as a hook for Gecko + # to setup its `mach` based logging. + setup_logging() + + ret = generate_taskgraph(options, parameters, logdir) + + if options["diff"]: + assert diffdir is not None + assert repo is not None + + # Reload taskgraph modules to pick up changes and clear global state. 
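+ # (The diff below is produced by checking out the base revision and
+ # regenerating the graph in this same process, so taskgraph modules
+ # imported for the current revision are dropped from sys.modules first.)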
+ for mod in sys.modules.copy(): + if mod != __name__ and mod.split(".", 1)[0].endswith("taskgraph"): + del sys.modules[mod] + + if options["diff"] == "default": + base_rev = repo.base_rev + else: + base_rev = options["diff"] + base_rev_file = base_rev.replace("/", "_") + + try: + repo.update(base_rev) + base_rev = repo.head_rev[:12] + options["output_file"] = os.path.join( + diffdir, f"{options['graph_attr']}_{base_rev_file}" + ) + print(f"Generating {options['graph_attr']} @ {base_rev}", file=sys.stderr) + ret |= generate_taskgraph(options, parameters, logdir) + finally: + repo.update(cur_rev) + + # Generate diff(s) + diffcmd = [ + "diff", + "-U20", + "--report-identical-files", + f"--label={options['graph_attr']}@{base_rev}", + f"--label={options['graph_attr']}@{cur_rev}", + ] + + for spec in parameters: + base_path = os.path.join( + diffdir, f"{options['graph_attr']}_{base_rev_file}" + ) + cur_path = os.path.join(diffdir, f"{options['graph_attr']}_{cur_rev_file}") + + params_name = None + if len(parameters) > 1: + params_name = Parameters.format_spec(spec) + base_path += f"_{params_name}" + cur_path += f"_{params_name}" + + try: + proc = subprocess.run( + diffcmd + [base_path, cur_path], + capture_output=True, + text=True, + check=True, + ) + diff_output = proc.stdout + returncode = 0 + except subprocess.CalledProcessError as e: + # returncode 1 simply means diffs were found + if e.returncode != 1: + print(e.stderr, file=sys.stderr) + raise + diff_output = e.output + returncode = e.returncode + + dump_output( + diff_output, + # Don't bother saving file if no diffs were found. Log to + # console in this case instead. + path=None if returncode == 0 else output_file, + params_spec=spec if len(parameters) > 1 else None, + ) + + if options["format"] != "json": + print( + "If you were expecting differences in task bodies " + 'you should pass "-J"\n', + file=sys.stderr, + ) + + if len(parameters) > 1: + print(f"See '{logdir}' for logs", file=sys.stderr) + + return ret + + +@command("build-image", help="Build a Docker image") +@argument("image_name", help="Name of the image to build") +@argument( + "-t", "--tag", help="tag that the image should be built as.", metavar="name:tag" +) +@argument( + "--context-only", + help="File name the context tarball should be written to." + "with this option it will only build the context.tar.", + metavar="context.tar", +) +def build_image(args): + from taskgraph.docker import build_context, build_image + + validate_docker() + if args["context_only"] is None: + build_image(args["image_name"], args["tag"], os.environ) + else: + build_context(args["image_name"], args["context_only"], os.environ) + + +@command( + "load-image", + help="Load a pre-built Docker image. Note that you need to " + "have docker installed and running for this to work.", +) +@argument( + "--task-id", + help="Load the image at public/image.tar.zst in this task, " + "rather than searching the index", +) +@argument( + "-t", + "--tag", + help="tag that the image should be loaded as. 
If not " + "image will be loaded with tag from the tarball", + metavar="name:tag", +) +@argument( + "image_name", + nargs="?", + help="Load the image of this name based on the current " + "contents of the tree (as built for mozilla-central " + "or mozilla-inbound)", +) +def load_image(args): + from taskgraph.docker import load_image_by_name, load_image_by_task_id + + if not args.get("image_name") and not args.get("task_id"): + print("Specify either IMAGE-NAME or TASK-ID") + sys.exit(1) + validate_docker() + try: + if args["task_id"]: + ok = load_image_by_task_id(args["task_id"], args.get("tag")) + else: + ok = load_image_by_name(args["image_name"], args.get("tag")) + if not ok: + sys.exit(1) + except Exception: + traceback.print_exc() + sys.exit(1) + + +def validate_docker(): + p = subprocess.run(["docker", "ps"], capture_output=True) + if p.returncode != 0: + print("Error connecting to Docker:", p.stderr) + sys.exit(1) + + +@command("image-digest", help="Print the digest of a docker image.") +@argument( + "image_name", + help="Print the digest of the image of this name based on the current " + "contents of the tree.", +) +def image_digest(args): + from taskgraph.docker import get_image_digest + + try: + digest = get_image_digest(args["image_name"]) + print(digest) + except Exception: + traceback.print_exc() + sys.exit(1) + + +@command("decision", help="Run the decision task") +@argument("--root", "-r", help="root of the taskgraph definition relative to topsrcdir") +@argument( + "--message", + required=False, + help=argparse.SUPPRESS, +) +@argument( + "--project", + required=True, + help="Project to use for creating task graph. Example: --project=try", +) +@argument("--pushlog-id", dest="pushlog_id", required=True, default="0") +@argument("--pushdate", dest="pushdate", required=True, type=int, default=0) +@argument("--owner", required=True, help="email address of who owns this graph") +@argument("--level", required=True, help="SCM level of this repository") +@argument( + "--target-tasks-method", help="method for selecting the target tasks to generate" +) +@argument( + "--repository-type", + required=True, + help='Type of repository, either "hg" or "git"', +) +@argument("--base-repository", required=True, help='URL for "base" repository to clone') +@argument( + "--base-ref", default="", help='Reference of the revision in the "base" repository' +) +@argument( + "--base-rev", + default="", + help="Taskgraph decides what to do based on the revision range between " + "`--base-rev` and `--head-rev`. 
Value is determined automatically if not provided", +) +@argument( + "--head-repository", + required=True, + help='URL for "head" repository to fetch revision from', +) +@argument( + "--head-ref", required=True, help="Reference (this is same as rev usually for hg)" +) +@argument( + "--head-rev", required=True, help="Commit revision to use from head repository" +) +@argument("--head-tag", help="Tag attached to the revision", default="") +@argument( + "--tasks-for", required=True, help="the tasks_for value used to generate this task" +) +@argument("--try-task-config-file", help="path to try task configuration file") +def decision(options): + from taskgraph.decision import taskgraph_decision + + taskgraph_decision(options) + + +@command("action-callback", description="Run action callback used by action tasks") +@argument( + "--root", + "-r", + default="taskcluster/ci", + help="root of the taskgraph definition relative to topsrcdir", +) +def action_callback(options): + from taskgraph.actions import trigger_action_callback + from taskgraph.actions.util import get_parameters + + try: + # the target task for this action (or null if it's a group action) + task_id = json.loads(os.environ.get("ACTION_TASK_ID", "null")) + # the target task group for this action + task_group_id = os.environ.get("ACTION_TASK_GROUP_ID", None) + input = json.loads(os.environ.get("ACTION_INPUT", "null")) + callback = os.environ.get("ACTION_CALLBACK", None) + root = options["root"] + + parameters = get_parameters(task_group_id) + + return trigger_action_callback( + task_group_id=task_group_id, + task_id=task_id, + input=input, + callback=callback, + parameters=parameters, + root=root, + test=False, + ) + except Exception: + traceback.print_exc() + sys.exit(1) + + +@command("test-action-callback", description="Run an action callback in a testing mode") +@argument( + "--root", + "-r", + default="taskcluster/ci", + help="root of the taskgraph definition relative to topsrcdir", +) +@argument( + "--parameters", + "-p", + default="", + help="parameters file (.yml or .json; see " "`taskcluster/docs/parameters.rst`)`", +) +@argument("--task-id", default=None, help="TaskId to which the action applies") +@argument( + "--task-group-id", default=None, help="TaskGroupId to which the action applies" +) +@argument("--input", default=None, help="Action input (.yml or .json)") +@argument("callback", default=None, help="Action callback name (Python function name)") +def test_action_callback(options): + import taskgraph.actions + import taskgraph.parameters + from taskgraph.config import load_graph_config + from taskgraph.util import yaml + + def load_data(filename): + with open(filename) as f: + if filename.endswith(".yml"): + return yaml.load_stream(f) + elif filename.endswith(".json"): + return json.load(f) + else: + raise Exception(f"unknown filename {filename}") + + try: + task_id = options["task_id"] + + if options["input"]: + input = load_data(options["input"]) + else: + input = None + + root = options["root"] + graph_config = load_graph_config(root) + trust_domain = graph_config["trust-domain"] + graph_config.register() + + parameters = taskgraph.parameters.load_parameters_file( + options["parameters"], strict=False, trust_domain=trust_domain + ) + parameters.check() + + return taskgraph.actions.trigger_action_callback( + task_group_id=options["task_group_id"], + task_id=task_id, + input=input, + callback=options["callback"], + parameters=parameters, + root=root, + test=True, + ) + except Exception: + traceback.print_exc() + 
sys.exit(1) + + +@command( + "init", description="Initialize a new Taskgraph setup in a new or existing project." +) +@argument( + "-f", + "--force", + action="store_true", + default=False, + help="Bypass safety checks.", +) +@argument( + "--prompt", + dest="no_input", + action="store_false", + default=True, + help="Prompt for input rather than using default values (advanced).", +) +@argument( + "--template", + default="gh:taskcluster/taskgraph", + help=argparse.SUPPRESS, # used for testing +) +def init_taskgraph(options): + from cookiecutter.main import cookiecutter + + import taskgraph + from taskgraph.util.vcs import get_repository + + repo = get_repository(os.getcwd()) + root = Path(repo.path) + + # Clean up existing installations if necessary. + tc_yml = root.joinpath(".taskcluster.yml") + if tc_yml.is_file(): + if not options["force"]: + proceed = input( + "A Taskcluster setup already exists in this repository, " + "would you like to overwrite it? [y/N]: " + ).lower() + while proceed not in ("y", "yes", "n", "no"): + proceed = input(f"Invalid option '{proceed}'! Try again: ") + + if proceed[0] == "n": + sys.exit(1) + + tc_yml.unlink() + tg_dir = root.joinpath("taskcluster") + if tg_dir.is_dir(): + shutil.rmtree(tg_dir) + + # Populate some defaults from the current repository. + context = {"project_name": root.name} + + try: + repo_url = repo.get_url(remote=repo.remote_name) + except RuntimeError: + repo_url = "" + + if repo.tool == "git" and "github.com" in repo_url: + context["repo_host"] = "github" + elif repo.tool == "hg" and "hg.mozilla.org" in repo_url: + context["repo_host"] = "hgmo" + else: + print( + dedent( + """\ + Repository not supported! + + Taskgraph only supports repositories hosted on Github or hg.mozilla.org. + Ensure you have a remote that points to one of these locations. + """ + ), + file=sys.stderr, + ) + return 1 + + # Generate the project. + cookiecutter( + options["template"], + checkout=taskgraph.__version__, + directory="template", + extra_context=context, + no_input=options["no_input"], + output_dir=root.parent, + overwrite_if_exists=True, + ) + + +def create_parser(): + parser = argparse.ArgumentParser(description="Interact with taskgraph") + subparsers = parser.add_subparsers() + for _, (func, args, kwargs, defaults) in commands.items(): + subparser = subparsers.add_parser(*args, **kwargs) + for arg in func.args: + subparser.add_argument(*arg[0], **arg[1]) + subparser.set_defaults(command=func, **defaults) + return parser + + +def setup_logging(): + logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(message)s", level=logging.INFO + ) + + +def main(args=sys.argv[1:]): + setup_logging() + parser = create_parser() + args = parser.parse_args(args) + try: + return args.command(vars(args)) + except Exception: + traceback.print_exc() + sys.exit(1) diff --git a/third_party/python/taskcluster_taskgraph/taskgraph/morph.py b/third_party/python/taskcluster_taskgraph/taskgraph/morph.py new file mode 100644 index 0000000000..bfa1560270 --- /dev/null +++ b/third_party/python/taskcluster_taskgraph/taskgraph/morph.py @@ -0,0 +1,261 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. + +""" +Graph morphs are modifications to task-graphs that take place *after* the +optimization phase. 
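+ (One example, implemented below, is the index-task morph: it moves a task's
+ "index.*" routes onto a small follow-up task, which helps keep heavily
+ indexed tasks within route limits; see MAX_ROUTES.)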
+ +These graph morphs are largely invisible to developers running `./mach` +locally, so they should be limited to changes that do not modify the meaning of +the graph. +""" + +# Note that the translation of `{'task-reference': '..'}` and +# `artifact-reference` are handled in the optimization phase (since +# optimization involves dealing with taskIds directly). Similarly, +# `{'relative-datestamp': '..'}` is handled at the last possible moment during +# task creation. + + +import logging +import os +import re + +from slugid import nice as slugid + +from .graph import Graph +from .task import Task +from .taskgraph import TaskGraph +from .util.workertypes import get_worker_type + +here = os.path.abspath(os.path.dirname(__file__)) +logger = logging.getLogger(__name__) +MAX_ROUTES = 10 + +registered_morphs = [] + + +def register_morph(func): + registered_morphs.append(func) + + +def amend_taskgraph(taskgraph, label_to_taskid, to_add): + """Add the given tasks to the taskgraph, returning a new taskgraph""" + new_tasks = taskgraph.tasks.copy() + new_edges = set(taskgraph.graph.edges) + for task in to_add: + new_tasks[task.task_id] = task + assert task.label not in label_to_taskid + label_to_taskid[task.label] = task.task_id + for depname, dep in task.dependencies.items(): + new_edges.add((task.task_id, dep, depname)) + + taskgraph = TaskGraph(new_tasks, Graph(set(new_tasks), new_edges)) + return taskgraph, label_to_taskid + + +def derive_index_task(task, taskgraph, label_to_taskid, parameters, graph_config): + """Create the shell of a task that depends on `task` and on the given docker + image.""" + purpose = "index-task" + label = f"{purpose}-{task.label}" + provisioner_id, worker_type = get_worker_type( + graph_config, "misc", parameters["level"] + ) + + task_def = { + "provisionerId": provisioner_id, + "workerType": worker_type, + "dependencies": [task.task_id], + "created": {"relative-datestamp": "0 seconds"}, + "deadline": task.task["deadline"], + # no point existing past the parent task's deadline + "expires": task.task["deadline"], + "metadata": { + "name": label, + "description": "{} for {}".format( + purpose, task.task["metadata"]["description"] + ), + "owner": task.task["metadata"]["owner"], + "source": task.task["metadata"]["source"], + }, + "scopes": [], + "payload": { + "image": { + "path": "public/image.tar.zst", + "namespace": "taskgraph.cache.level-3.docker-images.v2.index-task.latest", + "type": "indexed-image", + }, + "features": { + "taskclusterProxy": True, + }, + "maxRunTime": 600, + }, + } + + # only include the docker-image dependency here if it is actually in the + # taskgraph (has not been optimized). It is included in + # task_def['dependencies'] unconditionally. + dependencies = {"parent": task.task_id} + + task = Task( + kind="misc", + label=label, + attributes={}, + task=task_def, + dependencies=dependencies, + ) + task.task_id = slugid() + return task, taskgraph, label_to_taskid + + +def make_index_task(parent_task, taskgraph, label_to_taskid, parameters, graph_config): + index_paths = [ + r.split(".", 1)[1] for r in parent_task.task["routes"] if r.startswith("index.") + ] + parent_task.task["routes"] = [ + r for r in parent_task.task["routes"] if not r.startswith("index.") + ] + + task, taskgraph, label_to_taskid = derive_index_task( + parent_task, taskgraph, label_to_taskid, parameters, graph_config + ) + + # we need to "summarize" the scopes, otherwise a particularly + # namespace-heavy index task might have more scopes than can fit in a + # temporary credential. 
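    # (Editorial note, not part of the upstream file) Assuming a trust domain
    # of "gecko", an index path such as
    # "gecko.v2.mozilla-central.latest.taskgraph.decision" is collapsed by the
    # regex below to "gecko.v2.mozilla-central.*", so many related routes end
    # up sharing the single scope
    # "index:insert-task:gecko.v2.mozilla-central.*".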
+ scopes = set() + domain_index_regex = re.compile( + r"({trust_domain}\.v2\.[^.]*\.).*".format( + trust_domain=re.escape(graph_config["trust-domain"]) + ) + ) + index_path_res = [domain_index_regex] + for path in graph_config["taskgraph"].get("index-path-regexes", ()): + index_path_res.append(re.compile(path)) + for path in index_paths: + for index_path_re in index_path_res: + match = index_path_re.match(path) + if match: + path = match.group(1) + "*" + break + scope = f"index:insert-task:{path}" + scopes.add(scope) + task.task["scopes"] = sorted(scopes) + + task.task["payload"]["command"] = ["insert-indexes.js"] + index_paths + task.task["payload"]["env"] = { + "TARGET_TASKID": parent_task.task_id, + "INDEX_RANK": parent_task.task.get("extra", {}).get("index", {}).get("rank", 0), + } + return task, taskgraph, label_to_taskid + + +@register_morph +def add_index_tasks(taskgraph, label_to_taskid, parameters, graph_config): + """ + The TaskCluster queue only allows 10 routes on a task, but we have tasks + with many more routes, for purposes of indexing. This graph morph adds + "index tasks" that depend on such tasks and do the index insertions + directly, avoiding the limits on task.routes. + """ + logger.debug("Morphing: adding index tasks") + + added = [] + for label, task in taskgraph.tasks.items(): + if len(task.task.get("routes", [])) <= MAX_ROUTES: + continue + task, taskgraph, label_to_taskid = make_index_task( + task, taskgraph, label_to_taskid, parameters, graph_config + ) + added.append(task) + + if added: + taskgraph, label_to_taskid = amend_taskgraph(taskgraph, label_to_taskid, added) + logger.info(f"Added {len(added)} index tasks") + + return taskgraph, label_to_taskid + + +def _get_morph_url(): + """ + Guess a URL for the current file, for source metadata for created tasks. + + If we checked out the taskgraph code with run-task in the decision task, + we can use TASKGRAPH_* to find the right version, which covers the + existing use case. 
+ """ + taskgraph_repo = os.environ.get( + "TASKGRAPH_HEAD_REPOSITORY", "https://github.com/taskcluster/taskgraph" + ) + taskgraph_rev = os.environ.get("TASKGRAPH_HEAD_REV", "default") + return f"{taskgraph_repo}/raw-file/{taskgraph_rev}/src/taskgraph/morph.py" + + +@register_morph +def add_code_review_task(taskgraph, label_to_taskid, parameters, graph_config): + logger.debug("Morphing: adding code review task") + + review_config = parameters.get("code-review") + if not review_config: + return taskgraph, label_to_taskid + + code_review_tasks = {} + for label, task in taskgraph.tasks.items(): + if task.attributes.get("code-review"): + code_review_tasks[task.label] = task.task_id + + if code_review_tasks: + code_review_task_def = { + "provisionerId": "built-in", + "workerType": "succeed", + "dependencies": sorted(code_review_tasks.values()), + # This option permits to run the task + # regardless of the dependencies tasks exit status + # as we are interested in the task failures + "requires": "all-resolved", + "created": {"relative-datestamp": "0 seconds"}, + "deadline": {"relative-datestamp": "1 day"}, + # no point existing past the parent task's deadline + "expires": {"relative-datestamp": "1 day"}, + "metadata": { + "name": "code-review", + "description": "List all issues found in static analysis and linting tasks", + "owner": parameters["owner"], + "source": _get_morph_url(), + }, + "scopes": [], + "payload": {}, + "routes": ["project.relman.codereview.v1.try_ending"], + "extra": { + "code-review": { + "phabricator-build-target": review_config[ + "phabricator-build-target" + ], + "repository": parameters["head_repository"], + "revision": parameters["head_rev"], + } + }, + } + task = Task( + kind="misc", + label="code-review", + attributes={}, + task=code_review_task_def, + dependencies=code_review_tasks, + ) + task.task_id = slugid() + taskgraph, label_to_taskid = amend_taskgraph(taskgraph, label_to_taskid, [task]) + logger.info("Added code review task.") + + return taskgraph, label_to_taskid + + +def morph(taskgraph, label_to_taskid, parameters, graph_config): + """Apply all morphs""" + for m in registered_morphs: + taskgraph, label_to_taskid = m( + taskgraph, label_to_taskid, parameters, graph_config + ) + return taskgraph, label_to_taskid diff --git a/third_party/python/taskcluster_taskgraph/taskgraph/optimize/__init__.py b/third_party/python/taskcluster_taskgraph/taskgraph/optimize/__init__.py new file mode 100644 index 0000000000..06287d877d --- /dev/null +++ b/third_party/python/taskcluster_taskgraph/taskgraph/optimize/__init__.py @@ -0,0 +1,8 @@ +from .base import ( # noqa: F401 + Alias, + All, + Any, + Not, + OptimizationStrategy, + register_strategy, +) diff --git a/third_party/python/taskcluster_taskgraph/taskgraph/optimize/base.py b/third_party/python/taskcluster_taskgraph/taskgraph/optimize/base.py new file mode 100644 index 0000000000..367b94e1de --- /dev/null +++ b/third_party/python/taskcluster_taskgraph/taskgraph/optimize/base.py @@ -0,0 +1,551 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. +""" +The objective of optimization is to remove as many tasks from the graph as +possible, as efficiently as possible, thereby delivering useful results as +quickly as possible. For example, ideally if only a test script is modified in +a push, then the resulting graph contains only the corresponding test suite +task. 
+ +See ``taskcluster/docs/optimization.rst`` for more information. +""" + +import datetime +import logging +from abc import ABCMeta, abstractmethod, abstractproperty +from collections import defaultdict + +from slugid import nice as slugid + +from taskgraph.graph import Graph +from taskgraph.taskgraph import TaskGraph +from taskgraph.util.parameterization import resolve_task_references, resolve_timestamps +from taskgraph.util.python_path import import_sibling_modules + +logger = logging.getLogger(__name__) +registry = {} + + +def register_strategy(name, args=()): + def wrap(cls): + if name not in registry: + registry[name] = cls(*args) + if not hasattr(registry[name], "description"): + registry[name].description = name + return cls + + return wrap + + +def optimize_task_graph( + target_task_graph, + requested_tasks, + params, + do_not_optimize, + decision_task_id, + existing_tasks=None, + strategy_override=None, +): + """ + Perform task optimization, returning a taskgraph and a map from label to + assigned taskId, including replacement tasks. + """ + label_to_taskid = {} + if not existing_tasks: + existing_tasks = {} + + # instantiate the strategies for this optimization process + strategies = registry.copy() + if strategy_override: + strategies.update(strategy_override) + + optimizations = _get_optimizations(target_task_graph, strategies) + + removed_tasks = remove_tasks( + target_task_graph=target_task_graph, + requested_tasks=requested_tasks, + optimizations=optimizations, + params=params, + do_not_optimize=do_not_optimize, + ) + + replaced_tasks = replace_tasks( + target_task_graph=target_task_graph, + optimizations=optimizations, + params=params, + do_not_optimize=do_not_optimize, + label_to_taskid=label_to_taskid, + existing_tasks=existing_tasks, + removed_tasks=removed_tasks, + ) + + return ( + get_subgraph( + target_task_graph, + removed_tasks, + replaced_tasks, + label_to_taskid, + decision_task_id, + ), + label_to_taskid, + ) + + +def _get_optimizations(target_task_graph, strategies): + def optimizations(label): + task = target_task_graph.tasks[label] + if task.optimization: + opt_by, arg = list(task.optimization.items())[0] + strategy = strategies[opt_by] + if hasattr(strategy, "description"): + opt_by += f" ({strategy.description})" + return (opt_by, strategy, arg) + else: + return ("never", strategies["never"], None) + + return optimizations + + +def _log_optimization(verb, opt_counts, opt_reasons=None): + if opt_reasons: + message = "optimize: {label} {action} because of {reason}" + for label, (action, reason) in opt_reasons.items(): + logger.debug(message.format(label=label, action=action, reason=reason)) + + if opt_counts: + logger.info( + f"{verb.title()} " + + ", ".join(f"{c} tasks by {b}" for b, c in sorted(opt_counts.items())) + + " during optimization." + ) + else: + logger.info(f"No tasks {verb} during optimization") + + +def remove_tasks( + target_task_graph, requested_tasks, params, optimizations, do_not_optimize +): + """ + Implement the "Removing Tasks" phase, returning a set of task labels of all removed tasks. + """ + opt_counts = defaultdict(int) + opt_reasons = {} + removed = set() + dependents_of = target_task_graph.graph.reverse_links_dict() + tasks = target_task_graph.tasks + prune_candidates = set() + + # Traverse graph so dependents (child nodes) are guaranteed to be processed + # first. + for label in target_task_graph.graph.visit_preorder(): + # Dependents that can be pruned away (shouldn't cause this task to run). 
+ # Only dependents that either: + # A) Explicitly reference this task in their 'if_dependencies' list, or + # B) Don't have an 'if_dependencies' attribute (i.e are in 'prune_candidates' + # because they should be removed but have prune_deps themselves) + # should be considered. + prune_deps = { + l + for l in dependents_of[label] + if l in prune_candidates + if not tasks[l].if_dependencies or label in tasks[l].if_dependencies + } + + def _keep(reason): + """Mark a task as being kept in the graph. Also recursively removes + any dependents from `prune_candidates`, assuming they should be + kept because of this task. + """ + opt_reasons[label] = ("kept", reason) + + # Removes dependents that were in 'prune_candidates' from a task + # that ended up being kept (and therefore the dependents should + # also be kept). + queue = list(prune_deps) + while queue: + l = queue.pop() + + # If l is a prune_dep of multiple tasks it could be queued up + # multiple times. Guard against it being already removed. + if l not in prune_candidates: + continue + + # If a task doesn't set 'if_dependencies' itself (rather it was + # added to 'prune_candidates' due to one of its depenendents), + # then we shouldn't remove it. + if not tasks[l].if_dependencies: + continue + + prune_candidates.remove(l) + queue.extend([r for r in dependents_of[l] if r in prune_candidates]) + + def _remove(reason): + """Potentially mark a task as being removed from the graph. If the + task has dependents that can be pruned, add this task to + `prune_candidates` rather than removing it. + """ + if prune_deps: + # If there are prune_deps, unsure if we can remove this task yet. + prune_candidates.add(label) + else: + opt_reasons[label] = ("removed", reason) + opt_counts[reason] += 1 + removed.add(label) + + # if we're not allowed to optimize, that's easy.. + if label in do_not_optimize: + _keep("do not optimize") + continue + + # If there are remaining tasks depending on this one, do not remove. + if any( + l for l in dependents_of[label] if l not in removed and l not in prune_deps + ): + _keep("dependent tasks") + continue + + # Some tasks in the task graph only exist because they were required + # by a task that has just been optimized away. They can now be removed. + if label not in requested_tasks: + _remove("dependents optimized") + continue + + # Call the optimization strategy. + task = tasks[label] + opt_by, opt, arg = optimizations(label) + if opt.should_remove_task(task, params, arg): + _remove(opt_by) + continue + + # Some tasks should only run if their dependency was also run. Since we + # haven't processed dependencies yet, we add them to a list of + # candidate tasks for pruning. + if task.if_dependencies: + opt_reasons[label] = ("kept", opt_by) + prune_candidates.add(label) + else: + _keep(opt_by) + + if prune_candidates: + reason = "if-dependencies pruning" + for label in prune_candidates: + # There's an edge case where a triangle graph can cause a + # dependency to stay in 'prune_candidates' when the dependent + # remains. Do a final check to ensure we don't create any bad + # edges. 
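            # (Editorial note, not part of the upstream file) "dependents" below
            # is True when at least one task that is neither a prune candidate
            # nor already removed still depends on this label; such a label has
            # to be kept, otherwise the final subgraph would contain an edge
            # pointing at a removed task.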
+ dependents = any( + d + for d in dependents_of[label] + if d not in prune_candidates + if d not in removed + ) + if dependents: + opt_reasons[label] = ("kept", "dependent tasks") + continue + removed.add(label) + opt_counts[reason] += 1 + opt_reasons[label] = ("removed", reason) + + _log_optimization("removed", opt_counts, opt_reasons) + return removed + + +def replace_tasks( + target_task_graph, + params, + optimizations, + do_not_optimize, + label_to_taskid, + removed_tasks, + existing_tasks, +): + """ + Implement the "Replacing Tasks" phase, returning a set of task labels of + all replaced tasks. The replacement taskIds are added to label_to_taskid as + a side-effect. + """ + opt_counts = defaultdict(int) + replaced = set() + dependents_of = target_task_graph.graph.reverse_links_dict() + dependencies_of = target_task_graph.graph.links_dict() + + for label in target_task_graph.graph.visit_postorder(): + # if we're not allowed to optimize, that's easy.. + if label in do_not_optimize: + continue + + # if this task depends on un-replaced, un-removed tasks, do not replace + if any( + l not in replaced and l not in removed_tasks for l in dependencies_of[label] + ): + continue + + # if the task already exists, that's an easy replacement + repl = existing_tasks.get(label) + if repl: + label_to_taskid[label] = repl + replaced.add(label) + opt_counts["existing_tasks"] += 1 + continue + + # call the optimization strategy + task = target_task_graph.tasks[label] + opt_by, opt, arg = optimizations(label) + + # compute latest deadline of dependents (if any) + dependents = [target_task_graph.tasks[l] for l in dependents_of[label]] + deadline = None + if dependents: + now = datetime.datetime.utcnow() + deadline = max( + resolve_timestamps(now, task.task["deadline"]) for task in dependents + ) + repl = opt.should_replace_task(task, params, deadline, arg) + if repl: + if repl is True: + # True means remove this task; get_subgraph will catch any + # problems with removed tasks being depended on + removed_tasks.add(label) + else: + label_to_taskid[label] = repl + replaced.add(label) + opt_counts[opt_by] += 1 + continue + + _log_optimization("replaced", opt_counts) + return replaced + + +def get_subgraph( + target_task_graph, + removed_tasks, + replaced_tasks, + label_to_taskid, + decision_task_id, +): + """ + Return the subgraph of target_task_graph consisting only of + non-optimized tasks and edges between them. + + To avoid losing track of taskIds for tasks optimized away, this method + simultaneously substitutes real taskIds for task labels in the graph, and + populates each task definition's `dependencies` key with the appropriate + taskIds. Task references are resolved in the process. 
+ """ + + # check for any dependency edges from included to removed tasks + bad_edges = [ + (l, r, n) + for l, r, n in target_task_graph.graph.edges + if l not in removed_tasks and r in removed_tasks + ] + if bad_edges: + probs = ", ".join( + f"{l} depends on {r} as {n} but it has been removed" + for l, r, n in bad_edges + ) + raise Exception("Optimization error: " + probs) + + # fill in label_to_taskid for anything not removed or replaced + assert replaced_tasks <= set(label_to_taskid) + for label in sorted( + target_task_graph.graph.nodes - removed_tasks - set(label_to_taskid) + ): + label_to_taskid[label] = slugid() + + # resolve labels to taskIds and populate task['dependencies'] + tasks_by_taskid = {} + named_links_dict = target_task_graph.graph.named_links_dict() + omit = removed_tasks | replaced_tasks + for label, task in target_task_graph.tasks.items(): + if label in omit: + continue + task.task_id = label_to_taskid[label] + named_task_dependencies = { + name: label_to_taskid[label] + for name, label in named_links_dict.get(label, {}).items() + } + + # Add remaining soft dependencies + if task.soft_dependencies: + named_task_dependencies.update( + { + label: label_to_taskid[label] + for label in task.soft_dependencies + if label in label_to_taskid and label not in omit + } + ) + + task.task = resolve_task_references( + task.label, + task.task, + task_id=task.task_id, + decision_task_id=decision_task_id, + dependencies=named_task_dependencies, + ) + deps = task.task.setdefault("dependencies", []) + deps.extend(sorted(named_task_dependencies.values())) + tasks_by_taskid[task.task_id] = task + + # resolve edges to taskIds + edges_by_taskid = ( + (label_to_taskid.get(left), label_to_taskid.get(right), name) + for (left, right, name) in target_task_graph.graph.edges + ) + # ..and drop edges that are no longer entirely in the task graph + # (note that this omits edges to replaced tasks, but they are still in task.dependnecies) + edges_by_taskid = { + (left, right, name) + for (left, right, name) in edges_by_taskid + if left in tasks_by_taskid and right in tasks_by_taskid + } + + return TaskGraph(tasks_by_taskid, Graph(set(tasks_by_taskid), edges_by_taskid)) + + +@register_strategy("never") +class OptimizationStrategy: + def should_remove_task(self, task, params, arg): + """Determine whether to optimize this task by removing it. Returns + True to remove.""" + return False + + def should_replace_task(self, task, params, deadline, arg): + """Determine whether to optimize this task by replacing it. 
Returns a + taskId to replace this task, True to replace with nothing, or False to + keep the task.""" + return False + + +@register_strategy("always") +class Always(OptimizationStrategy): + def should_remove_task(self, task, params, arg): + return True + + +class CompositeStrategy(OptimizationStrategy, metaclass=ABCMeta): + def __init__(self, *substrategies, **kwargs): + self.substrategies = [] + missing = set() + for sub in substrategies: + if isinstance(sub, str): + if sub not in registry.keys(): + missing.add(sub) + continue + sub = registry[sub] + + self.substrategies.append(sub) + + if missing: + raise TypeError( + "substrategies aren't registered: {}".format( + ", ".join(sorted(missing)) + ) + ) + + self.split_args = kwargs.pop("split_args", None) + if not self.split_args: + self.split_args = lambda arg, substrategies: [arg] * len(substrategies) + if kwargs: + raise TypeError("unexpected keyword args") + + @abstractproperty + def description(self): + """A textual description of the combined substrategies.""" + + @abstractmethod + def reduce(self, results): + """Given all substrategy results as a generator, return the overall + result.""" + + def _generate_results(self, fname, *args): + *passthru, arg = args + for sub, arg in zip( + self.substrategies, self.split_args(arg, self.substrategies) + ): + yield getattr(sub, fname)(*passthru, arg) + + def should_remove_task(self, *args): + results = self._generate_results("should_remove_task", *args) + return self.reduce(results) + + def should_replace_task(self, *args): + results = self._generate_results("should_replace_task", *args) + return self.reduce(results) + + +class Any(CompositeStrategy): + """Given one or more optimization strategies, remove or replace a task if any of them + says to. + + Replacement will use the value returned by the first strategy that says to replace. + """ + + @property + def description(self): + return "-or-".join([s.description for s in self.substrategies]) + + @classmethod + def reduce(cls, results): + for rv in results: + if rv: + return rv + return False + + +class All(CompositeStrategy): + """Given one or more optimization strategies, remove or replace a task if all of them + says to. + + Replacement will use the value returned by the first strategy passed in. + Note the values used for replacement need not be the same, as long as they + all say to replace. + """ + + @property + def description(self): + return "-and-".join([s.description for s in self.substrategies]) + + @classmethod + def reduce(cls, results): + for rv in results: + if not rv: + return rv + return True + + +class Alias(CompositeStrategy): + """Provides an alias to an existing strategy. + + This can be useful to swap strategies in and out without needing to modify + the task transforms. + """ + + def __init__(self, strategy): + super().__init__(strategy) + + @property + def description(self): + return self.substrategies[0].description + + def reduce(self, results): + return next(results) + + +class Not(CompositeStrategy): + """Given a strategy, returns the opposite.""" + + def __init__(self, strategy): + super().__init__(strategy) + + @property + def description(self): + return "not-" + self.substrategies[0].description + + def reduce(self, results): + return not next(results) + + +# Trigger registration in sibling modules. 
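# (Editorial note, not part of the upstream file) import_sibling_modules()
# imports the other modules in this package (such as strategies.py), which runs
# their @register_strategy decorators and populates the registry defined at the
# top of this file.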
+import_sibling_modules() diff --git a/third_party/python/taskcluster_taskgraph/taskgraph/optimize/strategies.py b/third_party/python/taskcluster_taskgraph/taskgraph/optimize/strategies.py new file mode 100644 index 0000000000..973b550632 --- /dev/null +++ b/third_party/python/taskcluster_taskgraph/taskgraph/optimize/strategies.py @@ -0,0 +1,64 @@ +import logging +from datetime import datetime + +from taskgraph import files_changed +from taskgraph.optimize.base import OptimizationStrategy, register_strategy +from taskgraph.util.taskcluster import find_task_id, status_task + +logger = logging.getLogger(__name__) + + +@register_strategy("index-search") +class IndexSearch(OptimizationStrategy): + # A task with no dependencies remaining after optimization will be replaced + # if artifacts exist for the corresponding index_paths. + # Otherwise, we're in one of the following cases: + # - the task has un-optimized dependencies + # - the artifacts have expired + # - some changes altered the index_paths and new artifacts need to be + # created. + # In every of those cases, we need to run the task to create or refresh + # artifacts. + + fmt = "%Y-%m-%dT%H:%M:%S.%fZ" + + def should_replace_task(self, task, params, deadline, index_paths): + "Look for a task with one of the given index paths" + for index_path in index_paths: + try: + task_id = find_task_id(index_path) + status = status_task(task_id) + # status can be `None` if we're in `testing` mode + # (e.g. test-action-callback) + if not status or status.get("state") in ("exception", "failed"): + continue + + if deadline and datetime.strptime( + status["expires"], self.fmt + ) < datetime.strptime(deadline, self.fmt): + continue + + return task_id + except KeyError: + # 404 will end up here and go on to the next index path + pass + + return False + + +@register_strategy("skip-unless-changed") +class SkipUnlessChanged(OptimizationStrategy): + def should_remove_task(self, task, params, file_patterns): + # pushlog_id == -1 - this is the case when run from a cron.yml job or on a git repository + if params.get("repository_type") == "hg" and params.get("pushlog_id") == -1: + return False + + changed = files_changed.check(params, file_patterns) + if not changed: + logger.debug( + 'no files found matching a pattern in `skip-unless-changed` for "{}"'.format( + task.label + ) + ) + return True + return False diff --git a/third_party/python/taskcluster_taskgraph/taskgraph/parameters.py b/third_party/python/taskcluster_taskgraph/taskgraph/parameters.py new file mode 100644 index 0000000000..48571d97ad --- /dev/null +++ b/third_party/python/taskcluster_taskgraph/taskgraph/parameters.py @@ -0,0 +1,376 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. 
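As an editorial aside (not part of the vendored files): register_strategy(), OptimizationStrategy and the composite helpers in taskgraph.optimize.base above are the extension point a consuming project uses to add its own optimization logic. A minimal sketch, with an invented strategy name and class, might look like this:

    from taskgraph.optimize.base import Alias, OptimizationStrategy, register_strategy

    @register_strategy("skip-unless-label-matches")
    class SkipUnlessLabelMatches(OptimizationStrategy):
        """Remove a task unless its label contains the configured substring."""

        def should_remove_task(self, task, params, arg):
            # `arg` is whatever the task put under its `optimization` key, e.g.
            #   optimization: {"skip-unless-label-matches": "linux"}
            return arg not in task.label

    # Aliases expose an already-registered strategy under a second name; this
    # assumes "skip-unless-changed" (registered above) is available by the time
    # this line runs.
    register_strategy("skip-unless-changed-alias", args=("skip-unless-changed",))(Alias)

A task would then opt in through its `optimization` field, and remove_tasks()/replace_tasks() above consult the registered instance when deciding whether to drop or replace it.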
+ +import gzip +import hashlib +import json +import os +import time +from datetime import datetime +from io import BytesIO +from pprint import pformat +from subprocess import CalledProcessError +from unittest.mock import Mock +from urllib.parse import urlparse +from urllib.request import urlopen + +import mozilla_repo_urls +from voluptuous import ALLOW_EXTRA, Any, Optional, Required, Schema + +from taskgraph.util import yaml +from taskgraph.util.readonlydict import ReadOnlyDict +from taskgraph.util.schema import validate_schema +from taskgraph.util.taskcluster import find_task_id, get_artifact_url +from taskgraph.util.vcs import get_repository + + +class ParameterMismatch(Exception): + """Raised when a parameters.yml has extra or missing parameters.""" + + +# Please keep this list sorted and in sync with docs/reference/parameters.rst +base_schema = Schema( + { + Required("base_repository"): str, + Required("base_ref"): str, + Required("base_rev"): str, + Required("build_date"): int, + Required("build_number"): int, + Required("do_not_optimize"): [str], + Required("enable_always_target"): Any(bool, [str]), + Required("existing_tasks"): {str: str}, + Required("filters"): [str], + Required("head_ref"): str, + Required("head_repository"): str, + Required("head_rev"): str, + Required("head_tag"): str, + Required("level"): str, + Required("moz_build_date"): str, + Required("next_version"): Any(str, None), + Required("optimize_strategies"): Any(str, None), + Required("optimize_target_tasks"): bool, + Required("owner"): str, + Required("project"): str, + Required("pushdate"): int, + Required("pushlog_id"): str, + Required("repository_type"): str, + # target-kinds is not included, since it should never be + # used at run-time + Required("target_tasks_method"): str, + Required("tasks_for"): str, + Required("version"): Any(str, None), + Optional("code-review"): { + Required("phabricator-build-target"): str, + }, + } +) + + +def get_contents(path): + with open(path) as fh: + contents = fh.readline().rstrip() + return contents + + +def get_version(repo_path): + version_path = os.path.join(repo_path, "version.txt") + return get_contents(version_path) if os.path.isfile(version_path) else None + + +def _get_defaults(repo_root=None): + repo_path = repo_root or os.getcwd() + try: + repo = get_repository(repo_path) + except RuntimeError: + # Use fake values if no repo is detected. 
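        # (Editorial note, not part of the upstream file) The Mock below stands
        # in for a real repository object so that the attribute and method
        # lookups that follow (repo.branch, repo.head_rev, repo.tool,
        # repo.get_url()) still succeed when taskgraph runs outside a checkout.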
+ repo = Mock(branch="", head_rev="", tool="git") + repo.get_url.return_value = "" + + try: + repo_url = repo.get_url() + parsed_url = mozilla_repo_urls.parse(repo_url) + project = parsed_url.repo_name + except ( + CalledProcessError, + mozilla_repo_urls.errors.InvalidRepoUrlError, + mozilla_repo_urls.errors.UnsupportedPlatformError, + ): + repo_url = "" + project = "" + + return { + "base_repository": repo_url, + "base_ref": "", + "base_rev": "", + "build_date": int(time.time()), + "build_number": 1, + "do_not_optimize": [], + "enable_always_target": True, + "existing_tasks": {}, + "filters": ["target_tasks_method"], + "head_ref": repo.branch or repo.head_rev, + "head_repository": repo_url, + "head_rev": repo.head_rev, + "head_tag": "", + "level": "3", + "moz_build_date": datetime.now().strftime("%Y%m%d%H%M%S"), + "next_version": None, + "optimize_strategies": None, + "optimize_target_tasks": True, + "owner": "nobody@mozilla.com", + "project": project, + "pushdate": int(time.time()), + "pushlog_id": "0", + "repository_type": repo.tool, + "target_tasks_method": "default", + "tasks_for": "", + "version": get_version(repo_path), + } + + +defaults_functions = [_get_defaults] + + +def extend_parameters_schema(schema, defaults_fn=None): + """ + Extend the schema for parameters to include per-project configuration. + + This should be called by the `taskgraph.register` function in the + graph-configuration. + + Args: + schema (Schema): The voluptuous.Schema object used to describe extended + parameters. + defaults_fn (function): A function which takes no arguments and returns a + dict mapping parameter name to default value in the + event strict=False (optional). + """ + global base_schema + global defaults_functions + base_schema = base_schema.extend(schema) + if defaults_fn: + defaults_functions.append(defaults_fn) + + +class Parameters(ReadOnlyDict): + """An immutable dictionary with nicer KeyError messages on failure""" + + def __init__(self, strict=True, repo_root=None, **kwargs): + self.strict = strict + self.spec = kwargs.pop("spec", None) + self._id = None + + if not self.strict: + # apply defaults to missing parameters + kwargs = Parameters._fill_defaults(repo_root=repo_root, **kwargs) + + ReadOnlyDict.__init__(self, **kwargs) + + @property + def id(self): + if not self._id: + self._id = hashlib.sha256( + json.dumps(self, sort_keys=True).encode("utf-8") + ).hexdigest()[:12] + + return self._id + + @staticmethod + def format_spec(spec): + """ + Get a friendly identifier from a parameters specifier. + + Args: + spec (str): Parameters specifier. + + Returns: + str: Name to identify parameters by. 
+ """ + if spec is None: + return "defaults" + + if any(spec.startswith(s) for s in ("task-id=", "project=")): + return spec + + result = urlparse(spec) + if result.scheme in ("http", "https"): + spec = result.path + + return os.path.splitext(os.path.basename(spec))[0] + + @staticmethod + def _fill_defaults(repo_root=None, **kwargs): + defaults = {} + for fn in defaults_functions: + defaults.update(fn(repo_root)) + + for name, default in defaults.items(): + if name not in kwargs: + kwargs[name] = default + return kwargs + + def check(self): + schema = ( + base_schema if self.strict else base_schema.extend({}, extra=ALLOW_EXTRA) + ) + try: + validate_schema(schema, self.copy(), "Invalid parameters:") + except Exception as e: + raise ParameterMismatch(str(e)) + + def __getitem__(self, k): + try: + return super().__getitem__(k) + except KeyError: + raise KeyError(f"taskgraph parameter {k!r} not found") + + def is_try(self): + """ + Determine whether this graph is being built on a try project or for + `mach try fuzzy`. + """ + return "try" in self["project"] or self["tasks_for"] == "github-pull-request" + + @property + def moz_build_date(self): + # XXX self["moz_build_date"] is left as a string because: + # * of backward compatibility + # * parameters are output in a YAML file + return datetime.strptime(self["moz_build_date"], "%Y%m%d%H%M%S") + + def file_url(self, path, pretty=False): + """ + Determine the VCS URL for viewing a file in the tree, suitable for + viewing by a human. + + :param str path: The path, relative to the root of the repository. + :param bool pretty: Whether to return a link to a formatted version of the + file, or the raw file version. + + :return str: The URL displaying the given path. + """ + if self["repository_type"] == "hg": + if path.startswith("comm/"): + path = path[len("comm/") :] + repo = self["comm_head_repository"] + rev = self["comm_head_rev"] + else: + repo = self["head_repository"] + rev = self["head_rev"] + endpoint = "file" if pretty else "raw-file" + return f"{repo}/{endpoint}/{rev}/{path}" + elif self["repository_type"] == "git": + # For getting the file URL for git repositories, we only support a Github HTTPS remote + repo = self["head_repository"] + if repo.startswith("https://github.com/"): + if repo.endswith("/"): + repo = repo[:-1] + + rev = self["head_rev"] + endpoint = "blob" if pretty else "raw" + return f"{repo}/{endpoint}/{rev}/{path}" + elif repo.startswith("git@github.com:"): + if repo.endswith(".git"): + repo = repo[:-4] + rev = self["head_rev"] + endpoint = "blob" if pretty else "raw" + return "{}/{}/{}/{}".format( + repo.replace("git@github.com:", "https://github.com/"), + endpoint, + rev, + path, + ) + else: + raise ParameterMismatch( + "Don't know how to determine file URL for non-github" + "repo: {}".format(repo) + ) + else: + raise RuntimeError( + 'Only the "git" and "hg" repository types are supported for using file_url()' + ) + + def __str__(self): + return f"Parameters(id={self.id}) (from {self.format_spec(self.spec)})" + + def __repr__(self): + return pformat(dict(self), indent=2) + + +def load_parameters_file( + spec, strict=True, overrides=None, trust_domain=None, repo_root=None +): + """ + Load parameters from a path, url, decision task-id or project. 
+ + Examples: + task-id=fdtgsD5DQUmAQZEaGMvQ4Q + project=mozilla-central + """ + + if overrides is None: + overrides = {} + overrides["spec"] = spec + + if not spec: + return Parameters(strict=strict, repo_root=repo_root, **overrides) + + try: + # reading parameters from a local parameters.yml file + f = open(spec) + except OSError: + # fetching parameters.yml using task task-id, project or supplied url + task_id = None + if spec.startswith("task-id="): + task_id = spec.split("=")[1] + elif spec.startswith("project="): + if trust_domain is None: + raise ValueError( + "Can't specify parameters by project " + "if trust domain isn't supplied.", + ) + index = "{trust_domain}.v2.{project}.latest.taskgraph.decision".format( + trust_domain=trust_domain, + project=spec.split("=")[1], + ) + task_id = find_task_id(index) + + if task_id: + spec = get_artifact_url(task_id, "public/parameters.yml") + f = urlopen(spec) + + # Decompress gzipped parameters. + if f.info().get("Content-Encoding") == "gzip": + buf = BytesIO(f.read()) + f = gzip.GzipFile(fileobj=buf) + + if spec.endswith(".yml"): + kwargs = yaml.load_stream(f) + elif spec.endswith(".json"): + kwargs = json.load(f) + else: + raise TypeError(f"Parameters file `{spec}` is not JSON or YAML") + + kwargs.update(overrides) + return Parameters(strict=strict, repo_root=repo_root, **kwargs) + + +def parameters_loader(spec, strict=True, overrides=None): + def get_parameters(graph_config): + try: + repo_root = graph_config.vcs_root + except Exception: + repo_root = None + + parameters = load_parameters_file( + spec, + strict=strict, + overrides=overrides, + repo_root=repo_root, + trust_domain=graph_config["trust-domain"], + ) + parameters.check() + return parameters + + return get_parameters diff --git a/third_party/python/taskcluster_taskgraph/taskgraph/run-task/fetch-content b/third_party/python/taskcluster_taskgraph/taskgraph/run-task/fetch-content new file mode 100755 index 0000000000..0af923d01d --- /dev/null +++ b/third_party/python/taskcluster_taskgraph/taskgraph/run-task/fetch-content @@ -0,0 +1,899 @@ +#!/usr/bin/python3 -u +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. 
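As an editorial aside (not part of the vendored files): the extend_parameters_schema() hook in the parameters module above is how a consuming project declares extra parameters. A minimal sketch with an invented parameter name and default, assuming it is called from the function referenced by `taskgraph.register` in the graph configuration:

    from voluptuous import Required

    from taskgraph.parameters import extend_parameters_schema

    def register(graph_config):
        extend_parameters_schema(
            # Extra keys are merged into the base schema above.
            {Required("release_type"): str},
            # Defaults are applied when parameters are built with strict=False;
            # accept an optional argument since the hook may be passed the
            # repository root.
            defaults_fn=lambda repo_root=None: {"release_type": "nightly"},
        )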
+ +import argparse +import bz2 +import concurrent.futures +import contextlib +import datetime +import gzip +import hashlib +import json +import lzma +import multiprocessing +import os +import pathlib +import random +import re +import stat +import subprocess +import sys +import tarfile +import tempfile +import time +import urllib.parse +import urllib.request +import zipfile + +try: + import zstandard +except ImportError: + zstandard = None + +try: + import certifi +except ImportError: + certifi = None + + +CONCURRENCY = multiprocessing.cpu_count() + + +def log(msg): + print(msg, file=sys.stderr) + sys.stderr.flush() + + +class IntegrityError(Exception): + """Represents an integrity error when downloading a URL.""" + + +def ZstdCompressor(*args, **kwargs): + if not zstandard: + raise ValueError("zstandard Python package not available") + return zstandard.ZstdCompressor(*args, **kwargs) + + +def ZstdDecompressor(*args, **kwargs): + if not zstandard: + raise ValueError("zstandard Python package not available") + return zstandard.ZstdDecompressor(*args, **kwargs) + + +@contextlib.contextmanager +def rename_after_close(fname, *args, **kwargs): + """ + Context manager that opens a temporary file to use as a writer, + and closes the file on context exit, renaming it to the expected + file name in case of success, or removing it in case of failure. + + Takes the same options as open(), but must be used as a context + manager. + """ + path = pathlib.Path(fname) + tmp = path.with_name("%s.tmp" % path.name) + try: + with tmp.open(*args, **kwargs) as fh: + yield fh + except Exception: + tmp.unlink() + raise + else: + tmp.rename(fname) + + +# The following is copied from +# https://github.com/mozilla-releng/redo/blob/6d07678a014e0c525e54a860381a165d34db10ff/redo/__init__.py#L15-L85 +def retrier(attempts=5, sleeptime=10, max_sleeptime=300, sleepscale=1.5, jitter=1): + """ + A generator function that sleeps between retries, handles exponential + backoff and jitter. The action you are retrying is meant to run after + retrier yields. + + At each iteration, we sleep for sleeptime + random.randint(-jitter, jitter). + Afterwards sleeptime is multiplied by sleepscale for the next iteration. + + Args: + attempts (int): maximum number of times to try; defaults to 5 + sleeptime (float): how many seconds to sleep between tries; defaults to + 60s (one minute) + max_sleeptime (float): the longest we'll sleep, in seconds; defaults to + 300s (five minutes) + sleepscale (float): how much to multiply the sleep time by each + iteration; defaults to 1.5 + jitter (int): random jitter to introduce to sleep time each iteration. + the amount is chosen at random between [-jitter, +jitter] + defaults to 1 + + Yields: + None, a maximum of `attempts` number of times + + Example: + >>> n = 0 + >>> for _ in retrier(sleeptime=0, jitter=0): + ... if n == 3: + ... # We did the thing! + ... break + ... n += 1 + >>> n + 3 + + >>> n = 0 + >>> for _ in retrier(sleeptime=0, jitter=0): + ... if n == 6: + ... # We did the thing! + ... break + ... n += 1 + ... else: + ... 
print("max tries hit") + max tries hit + """ + jitter = jitter or 0 # py35 barfs on the next line if jitter is None + if jitter > sleeptime: + # To prevent negative sleep times + raise Exception( + "jitter ({}) must be less than sleep time ({})".format(jitter, sleeptime) + ) + + sleeptime_real = sleeptime + for _ in range(attempts): + log("attempt %i/%i" % (_ + 1, attempts)) + + yield sleeptime_real + + if jitter: + sleeptime_real = sleeptime + random.randint(-jitter, jitter) + # our jitter should scale along with the sleeptime + jitter = int(jitter * sleepscale) + else: + sleeptime_real = sleeptime + + sleeptime *= sleepscale + + if sleeptime_real > max_sleeptime: + sleeptime_real = max_sleeptime + + # Don't need to sleep the last time + if _ < attempts - 1: + log( + "sleeping for %.2fs (attempt %i/%i)" % (sleeptime_real, _ + 1, attempts) + ) + time.sleep(sleeptime_real) + + +def stream_download(url, sha256=None, size=None, headers=None): + """Download a URL to a generator, optionally with content verification. + + If ``sha256`` or ``size`` are defined, the downloaded URL will be + validated against those requirements and ``IntegrityError`` will be + raised if expectations do not match. + + Because verification cannot occur until the file is completely downloaded + it is recommended for consumers to not do anything meaningful with the + data if content verification is being used. To securely handle retrieved + content, it should be streamed to a file or memory and only operated + on after the generator is exhausted without raising. + """ + log("Downloading %s" % url) + headers = headers or [] + + h = hashlib.sha256() + length = 0 + + t0 = time.time() + req_headers = {} + for header in headers: + key, val = header.split(":") + req_headers[key.strip()] = val.strip() + + req = urllib.request.Request(url, None, req_headers) + with urllib.request.urlopen( + req, timeout=60, cafile=certifi.where() + ) if certifi else urllib.request.urlopen(req, timeout=60) as fh: + if not url.endswith(".gz") and fh.info().get("Content-Encoding") == "gzip": + fh = gzip.GzipFile(fileobj=fh) + + while True: + chunk = fh.read(65536) + if not chunk: + break + + h.update(chunk) + length += len(chunk) + + yield chunk + + duration = time.time() - t0 + digest = h.hexdigest() + + log( + "%s resolved to %d bytes with sha256 %s in %.3fs" + % (url, length, digest, duration) + ) + + if size: + if size == length: + log("Verified size of %s" % url) + else: + raise IntegrityError( + "size mismatch on %s: wanted %d; got %d" % (url, size, length) + ) + + if sha256: + if digest == sha256: + log("Verified sha256 integrity of %s" % url) + else: + raise IntegrityError( + "sha256 mismatch on %s: wanted %s; got %s" % (url, sha256, digest) + ) + + +def download_to_path(url, path, sha256=None, size=None, headers=None): + """Download a URL to a filesystem path, possibly with verification.""" + + # We download to a temporary file and rename at the end so there's + # no chance of the final file being partially written or containing + # bad data. 
+ try: + path.unlink() + except FileNotFoundError: + pass + + for _ in retrier(attempts=5, sleeptime=60): + try: + log("Downloading %s to %s" % (url, path)) + + with rename_after_close(path, "wb") as fh: + for chunk in stream_download( + url, sha256=sha256, size=size, headers=headers + ): + fh.write(chunk) + + return + except IntegrityError: + raise + except Exception as e: + log("Download failed: {}".format(e)) + continue + + raise Exception("Download failed, no more retries!") + + +def download_to_memory(url, sha256=None, size=None): + """Download a URL to memory, possibly with verification.""" + + data = b"" + for _ in retrier(attempts=5, sleeptime=60): + try: + log("Downloading %s" % (url)) + + for chunk in stream_download(url, sha256=sha256, size=size): + data += chunk + + return data + except IntegrityError: + raise + except Exception as e: + log("Download failed: {}".format(e)) + continue + + raise Exception("Download failed, no more retries!") + + +def gpg_verify_path(path: pathlib.Path, public_key_data: bytes, signature_data: bytes): + """Verify that a filesystem path verifies using GPG. + + Takes a Path defining a file to verify. ``public_key_data`` contains + bytes with GPG public key data. ``signature_data`` contains a signed + GPG document to use with ``gpg --verify``. + """ + log("Validating GPG signature of %s" % path) + log("GPG key data:\n%s" % public_key_data.decode("ascii")) + + with tempfile.TemporaryDirectory() as td: + try: + # --batch since we're running unattended. + gpg_args = ["gpg", "--homedir", td, "--batch"] + + log("Importing GPG key...") + subprocess.run(gpg_args + ["--import"], input=public_key_data, check=True) + + log("Verifying GPG signature...") + subprocess.run( + gpg_args + ["--verify", "-", "%s" % path], + input=signature_data, + check=True, + ) + + log("GPG signature verified!") + finally: + # There is a race between the agent self-terminating and + # shutil.rmtree() from the temporary directory cleanup that can + # lead to exceptions. Kill the agent before cleanup to prevent this. + env = dict(os.environ) + env["GNUPGHOME"] = td + subprocess.run(["gpgconf", "--kill", "gpg-agent"], env=env) + + +def open_tar_stream(path: pathlib.Path): + """""" + if path.suffix == ".bz2": + return bz2.open(str(path), "rb") + elif path.suffix in (".gz", ".tgz") : + return gzip.open(str(path), "rb") + elif path.suffix == ".xz": + return lzma.open(str(path), "rb") + elif path.suffix == ".zst": + dctx = ZstdDecompressor() + return dctx.stream_reader(path.open("rb")) + elif path.suffix == ".tar": + return path.open("rb") + else: + raise ValueError("unknown archive format for tar file: %s" % path) + + +def archive_type(path: pathlib.Path): + """Attempt to identify a path as an extractable archive.""" + if path.suffixes[-2:-1] == [".tar"] or path.suffixes[-1:] == [".tgz"]: + return "tar" + elif path.suffix == ".zip": + return "zip" + else: + return None + + +def extract_archive(path, dest_dir, typ): + """Extract an archive to a destination directory.""" + + # Resolve paths to absolute variants. + path = path.resolve() + dest_dir = dest_dir.resolve() + + log("Extracting %s to %s" % (path, dest_dir)) + t0 = time.time() + + # We pipe input to the decompressor program so that we can apply + # custom decompressors that the program may not know about. + if typ == "tar": + ifh = open_tar_stream(path) + # On Windows, the tar program doesn't support things like symbolic + # links, while Windows actually support them. The tarfile module in + # python does. So use that. 
But since it's significantly slower than + # the tar program on Linux, only use tarfile on Windows (tarfile is + # also not much slower on Windows, presumably because of the + # notoriously bad I/O). + if sys.platform == "win32": + tar = tarfile.open(fileobj=ifh, mode="r|") + tar.extractall(str(dest_dir)) + args = [] + else: + args = ["tar", "xf", "-"] + pipe_stdin = True + elif typ == "zip": + # unzip from stdin has wonky behavior. We don't use a pipe for it. + ifh = open(os.devnull, "rb") + args = ["unzip", "-q", "-o", str(path)] + pipe_stdin = False + else: + raise ValueError("unknown archive format: %s" % path) + + if args: + with ifh, subprocess.Popen( + args, cwd=str(dest_dir), bufsize=0, stdin=subprocess.PIPE + ) as p: + while True: + if not pipe_stdin: + break + + chunk = ifh.read(131072) + if not chunk: + break + + p.stdin.write(chunk) + + if p.returncode: + raise Exception("%r exited %d" % (args, p.returncode)) + + log("%s extracted in %.3fs" % (path, time.time() - t0)) + + +def repack_archive( + orig: pathlib.Path, dest: pathlib.Path, strip_components=0, prefix="" +): + assert orig != dest + log("Repacking as %s" % dest) + orig_typ = archive_type(orig) + typ = archive_type(dest) + if not orig_typ: + raise Exception("Archive type not supported for %s" % orig.name) + if not typ: + raise Exception("Archive type not supported for %s" % dest.name) + + if dest.suffixes[-2:] != [".tar", ".zst"]: + raise Exception("Only producing .tar.zst archives is supported.") + + if strip_components or prefix: + + def filter(name): + if strip_components: + stripped = "/".join(name.split("/")[strip_components:]) + if not stripped: + raise Exception( + "Stripping %d components would remove files" % strip_components + ) + name = stripped + return prefix + name + + else: + filter = None + + with rename_after_close(dest, "wb") as fh: + ctx = ZstdCompressor() + if orig_typ == "zip": + assert typ == "tar" + zip = zipfile.ZipFile(orig) + # Convert the zip stream to a tar on the fly. + with ctx.stream_writer(fh) as compressor, tarfile.open( + fileobj=compressor, mode="w:" + ) as tar: + for zipinfo in zip.infolist(): + if zipinfo.is_dir(): + continue + tarinfo = tarfile.TarInfo() + filename = zipinfo.filename + tarinfo.name = filter(filename) if filter else filename + tarinfo.size = zipinfo.file_size + # Zip files don't have any knowledge of the timezone + # they were created in. Which is not really convenient to + # reliably convert to a timestamp. But we don't really + # care about accuracy, but rather about reproducibility, + # so we pick UTC. + time = datetime.datetime( + *zipinfo.date_time, tzinfo=datetime.timezone.utc + ) + tarinfo.mtime = time.timestamp() + # 0 is MS-DOS, 3 is UNIX. Only in the latter case do we + # get anything useful for the tar file mode. + if zipinfo.create_system == 3: + mode = zipinfo.external_attr >> 16 + else: + mode = 0o0644 + tarinfo.mode = stat.S_IMODE(mode) + if stat.S_ISLNK(mode): + tarinfo.type = tarfile.SYMTYPE + tarinfo.linkname = zip.read(filename).decode() + tar.addfile(tarinfo, zip.open(filename)) + elif stat.S_ISREG(mode) or stat.S_IFMT(mode) == 0: + tar.addfile(tarinfo, zip.open(filename)) + else: + raise Exception("Unsupported file mode %o" % stat.S_IFMT(mode)) + + elif orig_typ == "tar": + if typ == "zip": + raise Exception("Repacking a tar to zip is not supported") + assert typ == "tar" + + ifh = open_tar_stream(orig) + if filter: + # To apply the filter, we need to open the tar stream and + # tweak it. 
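                # (Editorial note, not part of the upstream file) mode="r|"
                # opens the tar as a non-seekable stream, so the loop below
                # rewrites each member in order as it is read rather than
                # seeking around the archive.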
+ origtar = tarfile.open(fileobj=ifh, mode="r|") + with ctx.stream_writer(fh) as compressor, tarfile.open( + fileobj=compressor, + mode="w:", + format=origtar.format, + ) as tar: + for tarinfo in origtar: + if tarinfo.isdir(): + continue + tarinfo.name = filter(tarinfo.name) + if "path" in tarinfo.pax_headers: + tarinfo.pax_headers["path"] = filter( + tarinfo.pax_headers["path"] + ) + if tarinfo.isfile(): + tar.addfile(tarinfo, origtar.extractfile(tarinfo)) + else: + tar.addfile(tarinfo) + else: + # We only change compression here. The tar stream is unchanged. + ctx.copy_stream(ifh, fh) + + +def fetch_and_extract(url, dest_dir, extract=True, sha256=None, size=None): + """Fetch a URL and extract it to a destination path. + + If the downloaded URL is an archive, it is extracted automatically + and the archive is deleted. Otherwise the file remains in place in + the destination directory. + """ + + basename = urllib.parse.urlparse(url).path.split("/")[-1] + dest_path = dest_dir / basename + + download_to_path(url, dest_path, sha256=sha256, size=size) + + if not extract: + return + + typ = archive_type(dest_path) + if typ: + extract_archive(dest_path, dest_dir, typ) + log("Removing %s" % dest_path) + dest_path.unlink() + + +def fetch_urls(downloads): + """Fetch URLs pairs to a pathlib.Path.""" + with concurrent.futures.ThreadPoolExecutor(CONCURRENCY) as e: + fs = [] + + for download in downloads: + fs.append(e.submit(fetch_and_extract, *download)) + + for f in fs: + f.result() + + +def _git_checkout_github_archive( + dest_path: pathlib.Path, repo: str, commit: str, prefix: str +): + "Use github archive generator to speed up github git repo cloning" + repo = repo.rstrip("/") + github_url = "{repo}/archive/{commit}.tar.gz".format(**locals()) + + with tempfile.TemporaryDirectory() as td: + temp_dir = pathlib.Path(td) + dl_dest = temp_dir / "archive.tar.gz" + download_to_path(github_url, dl_dest) + repack_archive(dl_dest, dest_path, strip_components=1, prefix=prefix + "/") + + +def _github_submodule_required(repo: str, commit: str): + "Use github API to check if submodules are used" + url = "{repo}/blob/{commit}/.gitmodules".format(**locals()) + try: + status_code = urllib.request.urlopen(url).getcode() + return status_code == 200 + except: + return False + + +def git_checkout_archive( + dest_path: pathlib.Path, + repo: str, + commit: str, + prefix=None, + ssh_key=None, + include_dot_git=False, +): + """Produce an archive of the files comprising a Git checkout.""" + dest_path.parent.mkdir(parents=True, exist_ok=True) + + if not prefix: + prefix = repo.rstrip("/").rsplit("/", 1)[-1] + + if dest_path.suffixes[-2:] != [".tar", ".zst"]: + raise Exception("Only producing .tar.zst archives is supported.") + + if repo.startswith("https://github.com/"): + if not include_dot_git and not _github_submodule_required(repo, commit): + log("Using github archive service to speedup archive creation") + # Always log sha1 info, either from commit or resolved from repo. + if re.match(r"^[a-fA-F0-9]{40}$", commit): + revision = commit + else: + ref_output = subprocess.check_output(["git", "ls-remote", repo, + 'refs/heads/' + commit]) + revision, _ = ref_output.decode().split(maxsplit=1) + log("Fetching revision {}".format(revision)) + return _git_checkout_github_archive(dest_path, repo, commit, prefix) + + with tempfile.TemporaryDirectory() as td: + temp_dir = pathlib.Path(td) + + git_dir = temp_dir / prefix + + # This could be faster with a shallow clone. However, Git requires a ref + # to initiate a clone. 
Since the commit-ish may not refer to a ref, we + # simply perform a full clone followed by a checkout. + print("cloning %s to %s" % (repo, git_dir)) + + env = os.environ.copy() + keypath = "" + if ssh_key: + taskcluster_secret_url = api( + os.environ.get("TASKCLUSTER_PROXY_URL"), + "secrets", + "v1", + "secret/{keypath}".format(keypath=ssh_key), + ) + taskcluster_secret = b"".join(stream_download(taskcluster_secret_url)) + taskcluster_secret = json.loads(taskcluster_secret) + sshkey = taskcluster_secret["secret"]["ssh_privkey"] + + keypath = temp_dir.joinpath("ssh-key") + keypath.write_text(sshkey) + keypath.chmod(0o600) + + env = { + "GIT_SSH_COMMAND": "ssh -o 'StrictHostKeyChecking no' -i {keypath}".format( + keypath=keypath + ) + } + + subprocess.run(["git", "clone", "-n", repo, str(git_dir)], check=True, env=env) + + # Always use a detached head so that git prints out what it checked out. + subprocess.run( + ["git", "checkout", "--detach", commit], cwd=str(git_dir), check=True + ) + + # When including the .git, we want --depth 1, but a direct clone would not + # necessarily be able to give us the right commit. + if include_dot_git: + initial_clone = git_dir.with_name(git_dir.name + ".orig") + git_dir.rename(initial_clone) + subprocess.run( + [ + "git", + "clone", + "file://" + str(initial_clone), + str(git_dir), + "--depth", + "1", + ], + check=True, + ) + subprocess.run( + ["git", "remote", "set-url", "origin", repo], + cwd=str(git_dir), + check=True, + ) + + # --depth 1 can induce more work on the server side, so only use it for + # submodule initialization when we want to keep the .git directory. + depth = ["--depth", "1"] if include_dot_git else [] + subprocess.run( + ["git", "submodule", "update", "--init"] + depth, + cwd=str(git_dir), + check=True, + ) + + if keypath: + os.remove(keypath) + + print("creating archive %s of commit %s" % (dest_path, commit)) + exclude_dot_git = [] if include_dot_git else ["--exclude=.git"] + proc = subprocess.Popen( + [ + "tar", + "cf", + "-", + ] + + exclude_dot_git + + [ + "-C", + str(temp_dir), + prefix, + ], + stdout=subprocess.PIPE, + ) + + with rename_after_close(dest_path, "wb") as out: + ctx = ZstdCompressor() + ctx.copy_stream(proc.stdout, out) + + proc.wait() + + +def command_git_checkout_archive(args): + dest = pathlib.Path(args.dest) + + try: + git_checkout_archive( + dest, + args.repo, + args.commit, + prefix=args.path_prefix, + ssh_key=args.ssh_key_secret, + include_dot_git=args.include_dot_git, + ) + except Exception: + try: + dest.unlink() + except FileNotFoundError: + pass + + raise + + +def command_static_url(args): + gpg_sig_url = args.gpg_sig_url + gpg_env_key = args.gpg_key_env + + if bool(gpg_sig_url) != bool(gpg_env_key): + print("--gpg-sig-url and --gpg-key-env must both be defined") + return 1 + + if gpg_sig_url: + gpg_signature = b"".join(stream_download(gpg_sig_url)) + gpg_key = os.environb[gpg_env_key.encode("ascii")] + + dest = pathlib.Path(args.dest) + dest.parent.mkdir(parents=True, exist_ok=True) + + basename = urllib.parse.urlparse(args.url).path.split("/")[-1] + if basename.endswith("".join(dest.suffixes)): + dl_dest = dest + else: + dl_dest = dest.parent / basename + + try: + download_to_path( + args.url, dl_dest, sha256=args.sha256, size=args.size, headers=args.headers + ) + + if gpg_sig_url: + gpg_verify_path(dl_dest, gpg_key, gpg_signature) + + if dl_dest != dest or args.strip_components or args.add_prefix: + repack_archive(dl_dest, dest, args.strip_components, args.add_prefix) + except Exception: + try: + 
dl_dest.unlink() + except FileNotFoundError: + pass + + raise + + if dl_dest != dest: + log("Removing %s" % dl_dest) + dl_dest.unlink() + + +def api(root_url, service, version, path): + # taskcluster-lib-urls is not available when this script runs, so + # simulate its behavior: + return "{root_url}/api/{service}/{version}/{path}".format( + root_url=root_url, service=service, version=version, path=path + ) + + +def get_hash(fetch, root_url): + path = "task/{task}/artifacts/{artifact}".format( + task=fetch["task"], artifact="public/chain-of-trust.json" + ) + url = api(root_url, "queue", "v1", path) + cot = json.loads(download_to_memory(url)) + return cot["artifacts"][fetch["artifact"]]["sha256"] + + +def command_task_artifacts(args): + start = time.monotonic() + fetches = json.loads(os.environ["MOZ_FETCHES"]) + downloads = [] + for fetch in fetches: + extdir = pathlib.Path(args.dest) + if "dest" in fetch: + # Note: normpath doesn't like pathlib.Path in python 3.5 + extdir = pathlib.Path(os.path.normpath(str(extdir.joinpath(fetch["dest"])))) + extdir.mkdir(parents=True, exist_ok=True) + root_url = os.environ["TASKCLUSTER_ROOT_URL"] + sha256 = None + if fetch.get("verify-hash"): + sha256 = get_hash(fetch, root_url) + if fetch["artifact"].startswith("public/"): + path = "task/{task}/artifacts/{artifact}".format( + task=fetch["task"], artifact=fetch["artifact"] + ) + url = api(root_url, "queue", "v1", path) + else: + url = ("{proxy_url}/api/queue/v1/task/{task}/artifacts/{artifact}").format( + proxy_url=os.environ["TASKCLUSTER_PROXY_URL"], + task=fetch["task"], + artifact=fetch["artifact"], + ) + downloads.append((url, extdir, fetch["extract"], sha256)) + + fetch_urls(downloads) + end = time.monotonic() + + perfherder_data = { + "framework": {"name": "build_metrics"}, + "suites": [ + { + "name": "fetch_content", + "value": end - start, + "lowerIsBetter": True, + "shouldAlert": False, + "subtests": [], + } + ], + } + print("PERFHERDER_DATA: {}".format(json.dumps(perfherder_data)), file=sys.stderr) + + +def main(): + parser = argparse.ArgumentParser() + subparsers = parser.add_subparsers(title="sub commands") + + git_checkout = subparsers.add_parser( + "git-checkout-archive", + help="Obtain an archive of files from a Git repository checkout", + ) + git_checkout.set_defaults(func=command_git_checkout_archive) + git_checkout.add_argument( + "--path-prefix", help="Prefix for paths in produced archive" + ) + git_checkout.add_argument("repo", help="URL to Git repository to be cloned") + git_checkout.add_argument("commit", help="Git commit to check out") + git_checkout.add_argument("dest", help="Destination path of archive") + git_checkout.add_argument( + "--ssh-key-secret", help="The scope path of the ssh key to used for checkout" + ) + git_checkout.add_argument( + "--include-dot-git", action="store_true", help="Include the .git directory" + ) + + url = subparsers.add_parser("static-url", help="Download a static URL") + url.set_defaults(func=command_static_url) + url.add_argument("--sha256", required=True, help="SHA-256 of downloaded content") + url.add_argument( + "--size", required=True, type=int, help="Size of downloaded content, in bytes" + ) + url.add_argument( + "--gpg-sig-url", + help="URL containing signed GPG document validating " "URL to fetch", + ) + url.add_argument( + "--gpg-key-env", help="Environment variable containing GPG key to validate" + ) + url.add_argument( + "--strip-components", + type=int, + default=0, + help="Number of leading components to strip from file " + "names in the 
downloaded archive", + ) + url.add_argument( + "--add-prefix", + default="", + help="Prefix to add to file names in the downloaded " "archive", + ) + url.add_argument( + "-H", + "--header", + default=[], + action="append", + dest="headers", + help="Header to send as part of the request, can be passed " "multiple times", + ) + url.add_argument("url", help="URL to fetch") + url.add_argument("dest", help="Destination path") + + artifacts = subparsers.add_parser("task-artifacts", help="Fetch task artifacts") + artifacts.set_defaults(func=command_task_artifacts) + artifacts.add_argument( + "-d", + "--dest", + default=os.environ.get("MOZ_FETCHES_DIR"), + help="Destination directory which will contain all " + "artifacts (defaults to $MOZ_FETCHES_DIR)", + ) + + args = parser.parse_args() + + if not args.dest: + parser.error( + "no destination directory specified, either pass in --dest " + "or set $MOZ_FETCHES_DIR" + ) + + return args.func(args) + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/third_party/python/taskcluster_taskgraph/taskgraph/run-task/hgrc b/third_party/python/taskcluster_taskgraph/taskgraph/run-task/hgrc new file mode 100755 index 0000000000..f6a2f6643c --- /dev/null +++ b/third_party/python/taskcluster_taskgraph/taskgraph/run-task/hgrc @@ -0,0 +1,33 @@ +# By default the progress bar starts after 3s and updates every 0.1s. We +# change this so it shows and updates every 1.0s. +# We also tell progress to assume a TTY is present so updates are printed +# even if there is no known TTY. +[progress] +delay = 1.0 +refresh = 1.0 +assume-tty = true + +[extensions] +share = +sparse = +robustcheckout = /usr/local/mercurial/robustcheckout.py + +[hostsecurity] +# When running a modern Python, Mercurial will default to TLS 1.1+. +# When running on a legacy Python, Mercurial will default to TLS 1.0+. +# There is no good reason we shouldn't be running a modern Python +# capable of speaking TLS 1.2. And the only Mercurial servers we care +# about should be running TLS 1.2. So make TLS 1.2 the minimum. +minimumprotocol = tls1.2 + +# Settings to make 1-click loaners more useful. +[extensions] +histedit = +rebase = + +[diff] +git = 1 +showfunc = 1 + +[pager] +pager = LESS=FRSXQ less diff --git a/third_party/python/taskcluster_taskgraph/taskgraph/run-task/robustcheckout.py b/third_party/python/taskcluster_taskgraph/taskgraph/run-task/robustcheckout.py new file mode 100644 index 0000000000..b5d2230211 --- /dev/null +++ b/third_party/python/taskcluster_taskgraph/taskgraph/run-task/robustcheckout.py @@ -0,0 +1,860 @@ +# This software may be used and distributed according to the terms of the +# GNU General Public License version 2 or any later version. + +"""Robustly perform a checkout. + +This extension provides the ``hg robustcheckout`` command for +ensuring a working directory is updated to the specified revision +from a source repo using best practices to ensure optimal clone +times and storage efficiency. +""" + +from __future__ import absolute_import + +import contextlib +import json +import os +import random +import re +import socket +import ssl +import time + +from mercurial.i18n import _ +from mercurial.node import hex, nullid +from mercurial import ( + commands, + configitems, + error, + exchange, + extensions, + hg, + match as matchmod, + pycompat, + registrar, + scmutil, + urllibcompat, + util, + vfs, +) + +# Causes worker to purge caches on process exit and for task to retry. 
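# (Aside: 72 is EX_OSFILE from BSD sysexits.h; the run-task script later in
# this directory defines the same EXIT_PURGE_CACHE = 72, so the worker treats
# both exit paths identically. A quick check, assuming a POSIX Python where
# os.EX_OSFILE is available:)
import os
assert getattr(os, "EX_OSFILE", 72) == 72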
+EXIT_PURGE_CACHE = 72 + +testedwith = ( + b"4.5 4.6 4.7 4.8 4.9 5.0 5.1 5.2 5.3 5.4 5.5 5.6 5.7 5.8 5.9 6.0 6.1 6.2 6.3 6.4" +) +minimumhgversion = b"4.5" + +cmdtable = {} +command = registrar.command(cmdtable) + +configtable = {} +configitem = registrar.configitem(configtable) + +configitem(b"robustcheckout", b"retryjittermin", default=configitems.dynamicdefault) +configitem(b"robustcheckout", b"retryjittermax", default=configitems.dynamicdefault) + + +def getsparse(): + from mercurial import sparse + + return sparse + + +def peerlookup(remote, v): + with remote.commandexecutor() as e: + return e.callcommand(b"lookup", {b"key": v}).result() + + +@command( + b"robustcheckout", + [ + (b"", b"upstream", b"", b"URL of upstream repo to clone from"), + (b"r", b"revision", b"", b"Revision to check out"), + (b"b", b"branch", b"", b"Branch to check out"), + (b"", b"purge", False, b"Whether to purge the working directory"), + (b"", b"sharebase", b"", b"Directory where shared repos should be placed"), + ( + b"", + b"networkattempts", + 3, + b"Maximum number of attempts for network " b"operations", + ), + (b"", b"sparseprofile", b"", b"Sparse checkout profile to use (path in repo)"), + ( + b"U", + b"noupdate", + False, + b"the clone will include an empty working directory\n" + b"(only a repository)", + ), + ], + b"[OPTION]... URL DEST", + norepo=True, +) +def robustcheckout( + ui, + url, + dest, + upstream=None, + revision=None, + branch=None, + purge=False, + sharebase=None, + networkattempts=None, + sparseprofile=None, + noupdate=False, +): + """Ensure a working copy has the specified revision checked out. + + Repository data is automatically pooled into the common directory + specified by ``--sharebase``, which is a required argument. It is required + because pooling storage prevents excessive cloning, which makes operations + complete faster. + + One of ``--revision`` or ``--branch`` must be specified. ``--revision`` + is preferred, as it is deterministic and there is no ambiguity as to which + revision will actually be checked out. + + If ``--upstream`` is used, the repo at that URL is used to perform the + initial clone instead of cloning from the repo where the desired revision + is located. + + ``--purge`` controls whether to removed untracked and ignored files from + the working directory. If used, the end state of the working directory + should only contain files explicitly under version control for the requested + revision. + + ``--sparseprofile`` can be used to specify a sparse checkout profile to use. + The sparse checkout profile corresponds to a file in the revision to be + checked out. If a previous sparse profile or config is present, it will be + replaced by this sparse profile. We choose not to "widen" the sparse config + so operations are as deterministic as possible. If an existing checkout + is present and it isn't using a sparse checkout, we error. This is to + prevent accidentally enabling sparse on a repository that may have + clients that aren't sparse aware. Sparse checkout support requires Mercurial + 4.3 or newer and the ``sparse`` extension must be enabled. + """ + if not revision and not branch: + raise error.Abort(b"must specify one of --revision or --branch") + + if revision and branch: + raise error.Abort(b"cannot specify both --revision and --branch") + + # Require revision to look like a SHA-1. 
+ if revision: + if ( + len(revision) < 12 + or len(revision) > 40 + or not re.match(b"^[a-f0-9]+$", revision) + ): + raise error.Abort( + b"--revision must be a SHA-1 fragment 12-40 " b"characters long" + ) + + sharebase = sharebase or ui.config(b"share", b"pool") + if not sharebase: + raise error.Abort( + b"share base directory not defined; refusing to operate", + hint=b"define share.pool config option or pass --sharebase", + ) + + # Sparse profile support was added in Mercurial 4.3, where it was highly + # experimental. Because of the fragility of it, we only support sparse + # profiles on 4.3. When 4.4 is released, we'll need to opt in to sparse + # support. We /could/ silently fall back to non-sparse when not supported. + # However, given that sparse has performance implications, we want to fail + # fast if we can't satisfy the desired checkout request. + if sparseprofile: + try: + extensions.find(b"sparse") + except KeyError: + raise error.Abort( + b"sparse extension must be enabled to use " b"--sparseprofile" + ) + + ui.warn(b"(using Mercurial %s)\n" % util.version()) + + # worker.backgroundclose only makes things faster if running anti-virus, + # which our automation doesn't. Disable it. + ui.setconfig(b"worker", b"backgroundclose", False) + # Don't wait forever if the connection hangs + ui.setconfig(b"http", b"timeout", 600) + + # By default the progress bar starts after 3s and updates every 0.1s. We + # change this so it shows and updates every 1.0s. + # We also tell progress to assume a TTY is present so updates are printed + # even if there is no known TTY. + # We make the config change here instead of in a config file because + # otherwise we're at the whim of whatever configs are used in automation. + ui.setconfig(b"progress", b"delay", 1.0) + ui.setconfig(b"progress", b"refresh", 1.0) + ui.setconfig(b"progress", b"assume-tty", True) + + sharebase = os.path.realpath(sharebase) + + optimes = [] + behaviors = set() + start = time.time() + + try: + return _docheckout( + ui, + url, + dest, + upstream, + revision, + branch, + purge, + sharebase, + optimes, + behaviors, + networkattempts, + sparse_profile=sparseprofile, + noupdate=noupdate, + ) + finally: + overall = time.time() - start + + # We store the overall time multiple ways in order to help differentiate + # the various "flavors" of operations. + + # ``overall`` is always the total operation time. + optimes.append(("overall", overall)) + + def record_op(name): + # If special behaviors due to "corrupt" storage occur, we vary the + # name to convey that. + if "remove-store" in behaviors: + name += "_rmstore" + if "remove-wdir" in behaviors: + name += "_rmwdir" + + optimes.append((name, overall)) + + # We break out overall operations primarily by their network interaction + # We have variants within for working directory operations. 
+ if "clone" in behaviors and "create-store" in behaviors: + record_op("overall_clone") + + if "sparse-update" in behaviors: + record_op("overall_clone_sparsecheckout") + else: + record_op("overall_clone_fullcheckout") + + elif "pull" in behaviors or "clone" in behaviors: + record_op("overall_pull") + + if "sparse-update" in behaviors: + record_op("overall_pull_sparsecheckout") + else: + record_op("overall_pull_fullcheckout") + + if "empty-wdir" in behaviors: + record_op("overall_pull_emptywdir") + else: + record_op("overall_pull_populatedwdir") + + else: + record_op("overall_nopull") + + if "sparse-update" in behaviors: + record_op("overall_nopull_sparsecheckout") + else: + record_op("overall_nopull_fullcheckout") + + if "empty-wdir" in behaviors: + record_op("overall_nopull_emptywdir") + else: + record_op("overall_nopull_populatedwdir") + + server_url = urllibcompat.urlreq.urlparse(url).netloc + + if "TASKCLUSTER_INSTANCE_TYPE" in os.environ: + perfherder = { + "framework": { + "name": "vcs", + }, + "suites": [], + } + for op, duration in optimes: + perfherder["suites"].append( + { + "name": op, + "value": duration, + "lowerIsBetter": True, + "shouldAlert": False, + "serverUrl": server_url.decode("utf-8"), + "hgVersion": util.version().decode("utf-8"), + "extraOptions": [os.environ["TASKCLUSTER_INSTANCE_TYPE"]], + "subtests": [], + } + ) + ui.write( + b"PERFHERDER_DATA: %s\n" + % pycompat.bytestr(json.dumps(perfherder, sort_keys=True)) + ) + + +def _docheckout( + ui, + url, + dest, + upstream, + revision, + branch, + purge, + sharebase, + optimes, + behaviors, + networkattemptlimit, + networkattempts=None, + sparse_profile=None, + noupdate=False, +): + if not networkattempts: + networkattempts = [1] + + def callself(): + return _docheckout( + ui, + url, + dest, + upstream, + revision, + branch, + purge, + sharebase, + optimes, + behaviors, + networkattemptlimit, + networkattempts=networkattempts, + sparse_profile=sparse_profile, + noupdate=noupdate, + ) + + @contextlib.contextmanager + def timeit(op, behavior): + behaviors.add(behavior) + errored = False + try: + start = time.time() + yield + except Exception: + errored = True + raise + finally: + elapsed = time.time() - start + + if errored: + op += "_errored" + + optimes.append((op, elapsed)) + + ui.write(b"ensuring %s@%s is available at %s\n" % (url, revision or branch, dest)) + + # We assume that we're the only process on the machine touching the + # repository paths that we were told to use. This means our recovery + # scenario when things aren't "right" is to just nuke things and start + # from scratch. This is easier to implement than verifying the state + # of the data and attempting recovery. And in some scenarios (such as + # potential repo corruption), it is probably faster, since verifying + # repos can take a while. + + destvfs = vfs.vfs(dest, audit=False, realpath=True) + + def deletesharedstore(path=None): + storepath = path or destvfs.read(b".hg/sharedpath").strip() + if storepath.endswith(b".hg"): + storepath = os.path.dirname(storepath) + + storevfs = vfs.vfs(storepath, audit=False) + storevfs.rmtree(forcibly=True) + + if destvfs.exists() and not destvfs.exists(b".hg"): + raise error.Abort(b"destination exists but no .hg directory") + + # Refuse to enable sparse checkouts on existing checkouts. The reasoning + # here is that another consumer of this repo may not be sparse aware. If we + # enabled sparse, we would lock them out. 
+ if destvfs.exists() and sparse_profile and not destvfs.exists(b".hg/sparse"): + raise error.Abort( + b"cannot enable sparse profile on existing " b"non-sparse checkout", + hint=b"use a separate working directory to use sparse", + ) + + # And the other direction for symmetry. + if not sparse_profile and destvfs.exists(b".hg/sparse"): + raise error.Abort( + b"cannot use non-sparse checkout on existing sparse " b"checkout", + hint=b"use a separate working directory to use sparse", + ) + + # Require checkouts to be tied to shared storage because efficiency. + if destvfs.exists(b".hg") and not destvfs.exists(b".hg/sharedpath"): + ui.warn(b"(destination is not shared; deleting)\n") + with timeit("remove_unshared_dest", "remove-wdir"): + destvfs.rmtree(forcibly=True) + + # Verify the shared path exists and is using modern pooled storage. + if destvfs.exists(b".hg/sharedpath"): + storepath = destvfs.read(b".hg/sharedpath").strip() + + ui.write(b"(existing repository shared store: %s)\n" % storepath) + + if not os.path.exists(storepath): + ui.warn(b"(shared store does not exist; deleting destination)\n") + with timeit("removed_missing_shared_store", "remove-wdir"): + destvfs.rmtree(forcibly=True) + elif not re.search(b"[a-f0-9]{40}/\.hg$", storepath.replace(b"\\", b"/")): + ui.warn( + b"(shared store does not belong to pooled storage; " + b"deleting destination to improve efficiency)\n" + ) + with timeit("remove_unpooled_store", "remove-wdir"): + destvfs.rmtree(forcibly=True) + + if destvfs.isfileorlink(b".hg/wlock"): + ui.warn( + b"(dest has an active working directory lock; assuming it is " + b"left over from a previous process and that the destination " + b"is corrupt; deleting it just to be sure)\n" + ) + with timeit("remove_locked_wdir", "remove-wdir"): + destvfs.rmtree(forcibly=True) + + def handlerepoerror(e): + if pycompat.bytestr(e) == _(b"abandoned transaction found"): + ui.warn(b"(abandoned transaction found; trying to recover)\n") + repo = hg.repository(ui, dest) + if not repo.recover(): + ui.warn(b"(could not recover repo state; " b"deleting shared store)\n") + with timeit("remove_unrecovered_shared_store", "remove-store"): + deletesharedstore() + + ui.warn(b"(attempting checkout from beginning)\n") + return callself() + + raise + + # At this point we either have an existing working directory using + # shared, pooled storage or we have nothing. + + def handlenetworkfailure(): + if networkattempts[0] >= networkattemptlimit: + raise error.Abort( + b"reached maximum number of network attempts; " b"giving up\n" + ) + + ui.warn( + b"(retrying after network failure on attempt %d of %d)\n" + % (networkattempts[0], networkattemptlimit) + ) + + # Do a backoff on retries to mitigate the thundering herd + # problem. This is an exponential backoff with a multipler + # plus random jitter thrown in for good measure. + # With the default settings, backoffs will be: + # 1) 2.5 - 6.5 + # 2) 5.5 - 9.5 + # 3) 11.5 - 15.5 + backoff = (2 ** networkattempts[0] - 1) * 1.5 + jittermin = ui.configint(b"robustcheckout", b"retryjittermin", 1000) + jittermax = ui.configint(b"robustcheckout", b"retryjittermax", 5000) + backoff += float(random.randint(jittermin, jittermax)) / 1000.0 + ui.warn(b"(waiting %.2fs before retry)\n" % backoff) + time.sleep(backoff) + + networkattempts[0] += 1 + + def handlepullerror(e): + """Handle an exception raised during a pull. + + Returns True if caller should call ``callself()`` to retry. 
+ """ + if isinstance(e, error.Abort): + if e.args[0] == _(b"repository is unrelated"): + ui.warn(b"(repository is unrelated; deleting)\n") + destvfs.rmtree(forcibly=True) + return True + elif e.args[0].startswith(_(b"stream ended unexpectedly")): + ui.warn(b"%s\n" % e.args[0]) + # Will raise if failure limit reached. + handlenetworkfailure() + return True + # TODO test this branch + elif isinstance(e, error.ResponseError): + if e.args[0].startswith(_(b"unexpected response from remote server:")): + ui.warn(b"(unexpected response from remote server; retrying)\n") + destvfs.rmtree(forcibly=True) + # Will raise if failure limit reached. + handlenetworkfailure() + return True + elif isinstance(e, ssl.SSLError): + # Assume all SSL errors are due to the network, as Mercurial + # should convert non-transport errors like cert validation failures + # to error.Abort. + ui.warn(b"ssl error: %s\n" % pycompat.bytestr(str(e))) + handlenetworkfailure() + return True + elif isinstance(e, urllibcompat.urlerr.httperror) and e.code >= 500: + ui.warn(b"http error: %s\n" % pycompat.bytestr(str(e.reason))) + handlenetworkfailure() + return True + elif isinstance(e, urllibcompat.urlerr.urlerror): + if isinstance(e.reason, socket.error): + ui.warn(b"socket error: %s\n" % pycompat.bytestr(str(e.reason))) + handlenetworkfailure() + return True + else: + ui.warn( + b"unhandled URLError; reason type: %s; value: %s\n" + % ( + pycompat.bytestr(e.reason.__class__.__name__), + pycompat.bytestr(str(e.reason)), + ) + ) + elif isinstance(e, socket.timeout): + ui.warn(b"socket timeout\n") + handlenetworkfailure() + return True + else: + ui.warn( + b"unhandled exception during network operation; type: %s; " + b"value: %s\n" + % (pycompat.bytestr(e.__class__.__name__), pycompat.bytestr(str(e))) + ) + + return False + + # Perform sanity checking of store. We may or may not know the path to the + # local store. It depends if we have an existing destvfs pointing to a + # share. To ensure we always find a local store, perform the same logic + # that Mercurial's pooled storage does to resolve the local store path. 
+ cloneurl = upstream or url + + try: + clonepeer = hg.peer(ui, {}, cloneurl) + rootnode = peerlookup(clonepeer, b"0") + except error.RepoLookupError: + raise error.Abort(b"unable to resolve root revision from clone " b"source") + except ( + error.Abort, + ssl.SSLError, + urllibcompat.urlerr.urlerror, + socket.timeout, + ) as e: + if handlepullerror(e): + return callself() + raise + + if rootnode == nullid: + raise error.Abort(b"source repo appears to be empty") + + storepath = os.path.join(sharebase, hex(rootnode)) + storevfs = vfs.vfs(storepath, audit=False) + + if storevfs.isfileorlink(b".hg/store/lock"): + ui.warn( + b"(shared store has an active lock; assuming it is left " + b"over from a previous process and that the store is " + b"corrupt; deleting store and destination just to be " + b"sure)\n" + ) + if destvfs.exists(): + with timeit("remove_dest_active_lock", "remove-wdir"): + destvfs.rmtree(forcibly=True) + + with timeit("remove_shared_store_active_lock", "remove-store"): + storevfs.rmtree(forcibly=True) + + if storevfs.exists() and not storevfs.exists(b".hg/requires"): + ui.warn( + b"(shared store missing requires file; this is a really " + b"odd failure; deleting store and destination)\n" + ) + if destvfs.exists(): + with timeit("remove_dest_no_requires", "remove-wdir"): + destvfs.rmtree(forcibly=True) + + with timeit("remove_shared_store_no_requires", "remove-store"): + storevfs.rmtree(forcibly=True) + + if storevfs.exists(b".hg/requires"): + requires = set(storevfs.read(b".hg/requires").splitlines()) + # "share-safe" (enabled by default as of hg 6.1) moved most + # requirements to a new file, so we need to look there as well to avoid + # deleting and re-cloning each time + if b"share-safe" in requires: + requires |= set(storevfs.read(b".hg/store/requires").splitlines()) + # FUTURE when we require generaldelta, this is where we can check + # for that. + required = {b"dotencode", b"fncache"} + + missing = required - requires + if missing: + ui.warn( + b"(shared store missing requirements: %s; deleting " + b"store and destination to ensure optimal behavior)\n" + % b", ".join(sorted(missing)) + ) + if destvfs.exists(): + with timeit("remove_dest_missing_requires", "remove-wdir"): + destvfs.rmtree(forcibly=True) + + with timeit("remove_shared_store_missing_requires", "remove-store"): + storevfs.rmtree(forcibly=True) + + created = False + + if not destvfs.exists(): + # Ensure parent directories of destination exist. + # Mercurial 3.8 removed ensuredirs and made makedirs race safe. + if util.safehasattr(util, "ensuredirs"): + makedirs = util.ensuredirs + else: + makedirs = util.makedirs + + makedirs(os.path.dirname(destvfs.base), notindexed=True) + makedirs(sharebase, notindexed=True) + + if upstream: + ui.write(b"(cloning from upstream repo %s)\n" % upstream) + + if not storevfs.exists(): + behaviors.add(b"create-store") + + try: + with timeit("clone", "clone"): + shareopts = {b"pool": sharebase, b"mode": b"identity"} + res = hg.clone( + ui, + {}, + clonepeer, + dest=dest, + update=False, + shareopts=shareopts, + stream=True, + ) + except ( + error.Abort, + ssl.SSLError, + urllibcompat.urlerr.urlerror, + socket.timeout, + ) as e: + if handlepullerror(e): + return callself() + raise + except error.RepoError as e: + return handlerepoerror(e) + except error.RevlogError as e: + ui.warn(b"(repo corruption: %s; deleting shared store)\n" % e) + with timeit("remove_shared_store_revlogerror", "remote-store"): + deletesharedstore() + return callself() + + # TODO retry here. 
+ if res is None: + raise error.Abort(b"clone failed") + + # Verify it is using shared pool storage. + if not destvfs.exists(b".hg/sharedpath"): + raise error.Abort(b"clone did not create a shared repo") + + created = True + + # The destination .hg directory should exist. Now make sure we have the + # wanted revision. + + repo = hg.repository(ui, dest) + + # We only pull if we are using symbolic names or the requested revision + # doesn't exist. + havewantedrev = False + + if revision: + try: + ctx = scmutil.revsingle(repo, revision) + except error.RepoLookupError: + ctx = None + + if ctx: + if not ctx.hex().startswith(revision): + raise error.Abort( + b"--revision argument is ambiguous", + hint=b"must be the first 12+ characters of a " b"SHA-1 fragment", + ) + + checkoutrevision = ctx.hex() + havewantedrev = True + + if not havewantedrev: + ui.write(b"(pulling to obtain %s)\n" % (revision or branch,)) + + remote = None + try: + remote = hg.peer(repo, {}, url) + pullrevs = [peerlookup(remote, revision or branch)] + checkoutrevision = hex(pullrevs[0]) + if branch: + ui.warn( + b"(remote resolved %s to %s; " + b"result is not deterministic)\n" % (branch, checkoutrevision) + ) + + if checkoutrevision in repo: + ui.warn(b"(revision already present locally; not pulling)\n") + else: + with timeit("pull", "pull"): + pullop = exchange.pull(repo, remote, heads=pullrevs) + if not pullop.rheads: + raise error.Abort(b"unable to pull requested revision") + except ( + error.Abort, + ssl.SSLError, + urllibcompat.urlerr.urlerror, + socket.timeout, + ) as e: + if handlepullerror(e): + return callself() + raise + except error.RepoError as e: + return handlerepoerror(e) + except error.RevlogError as e: + ui.warn(b"(repo corruption: %s; deleting shared store)\n" % e) + deletesharedstore() + return callself() + finally: + if remote: + remote.close() + + # Now we should have the wanted revision in the store. Perform + # working directory manipulation. + + # Avoid any working directory manipulations if `-U`/`--noupdate` was passed + if noupdate: + ui.write(b"(skipping update since `-U` was passed)\n") + return None + + # Purge if requested. We purge before update because this way we're + # guaranteed to not have conflicts on `hg update`. + if purge and not created: + ui.write(b"(purging working directory)\n") + purge = getattr(commands, "purge", None) + if not purge: + purge = extensions.find(b"purge").purge + + # Mercurial 4.3 doesn't purge files outside the sparse checkout. + # See https://bz.mercurial-scm.org/show_bug.cgi?id=5626. Force + # purging by monkeypatching the sparse matcher. + try: + old_sparse_fn = getattr(repo.dirstate, "_sparsematchfn", None) + if old_sparse_fn is not None: + repo.dirstate._sparsematchfn = lambda: matchmod.always() + + with timeit("purge", "purge"): + if purge( + ui, + repo, + all=True, + abort_on_err=True, + # The function expects all arguments to be + # defined. + **{"print": None, "print0": None, "dirs": None, "files": None} + ): + raise error.Abort(b"error purging") + finally: + if old_sparse_fn is not None: + repo.dirstate._sparsematchfn = old_sparse_fn + + # Update the working directory. + + if repo[b"."].node() == nullid: + behaviors.add("empty-wdir") + else: + behaviors.add("populated-wdir") + + if sparse_profile: + sparsemod = getsparse() + + # By default, Mercurial will ignore unknown sparse profiles. This could + # lead to a full checkout. Be more strict. 
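# (For context: a sparse checkout profile is simply a file tracked in the
# repository, typically along these lines -- a hypothetical example, not taken
# from any real tree:
#
#     %include build/sparse-profiles/skip-tests
#     [include]
#     glob:taskcluster/**
#     [exclude]
#     glob:**/tests/**
#
# The check below refuses to proceed if no such file exists at the requested
# revision, rather than silently falling back to a full checkout.)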
+ try: + repo.filectx(sparse_profile, changeid=checkoutrevision).data() + except error.ManifestLookupError: + raise error.Abort( + b"sparse profile %s does not exist at revision " + b"%s" % (sparse_profile, checkoutrevision) + ) + + old_config = sparsemod.parseconfig( + repo.ui, repo.vfs.tryread(b"sparse"), b"sparse" + ) + + old_includes, old_excludes, old_profiles = old_config + + if old_profiles == {sparse_profile} and not old_includes and not old_excludes: + ui.write( + b"(sparse profile %s already set; no need to update " + b"sparse config)\n" % sparse_profile + ) + else: + if old_includes or old_excludes or old_profiles: + ui.write( + b"(replacing existing sparse config with profile " + b"%s)\n" % sparse_profile + ) + else: + ui.write(b"(setting sparse config to profile %s)\n" % sparse_profile) + + # If doing an incremental update, this will perform two updates: + # one to change the sparse profile and another to update to the new + # revision. This is not desired. But there's not a good API in + # Mercurial to do this as one operation. + # TRACKING hg64 - Mercurial 6.4 and later require call to + # dirstate.changing_parents(repo) + def parentchange(repo): + if util.safehasattr(repo.dirstate, "changing_parents"): + return repo.dirstate.changing_parents(repo) + return repo.dirstate.parentchange() + + with repo.wlock(), parentchange(repo), timeit( + "sparse_update_config", "sparse-update-config" + ): + # pylint --py3k: W1636 + fcounts = list( + map( + len, + sparsemod._updateconfigandrefreshwdir( + repo, [], [], [sparse_profile], force=True + ), + ) + ) + + repo.ui.status( + b"%d files added, %d files dropped, " + b"%d files conflicting\n" % tuple(fcounts) + ) + + ui.write(b"(sparse refresh complete)\n") + + op = "update_sparse" if sparse_profile else "update" + behavior = "update-sparse" if sparse_profile else "update" + + with timeit(op, behavior): + if commands.update(ui, repo, rev=checkoutrevision, clean=True): + raise error.Abort(b"error updating") + + ui.write(b"updated to %s\n" % checkoutrevision) + + return None + + +def extsetup(ui): + # Ensure required extensions are loaded. + for ext in (b"purge", b"share"): + try: + extensions.find(ext) + except KeyError: + extensions.load(ui, ext, None) diff --git a/third_party/python/taskcluster_taskgraph/taskgraph/run-task/run-task b/third_party/python/taskcluster_taskgraph/taskgraph/run-task/run-task new file mode 100755 index 0000000000..267b5283ea --- /dev/null +++ b/third_party/python/taskcluster_taskgraph/taskgraph/run-task/run-task @@ -0,0 +1,1348 @@ +#!/usr/bin/python3 -u +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. + +"""Run a task after performing common actions. + +This script is meant to be the "driver" for TaskCluster based tasks. +It receives some common arguments to control the run-time environment. + +It performs actions as requested from the arguments. Then it executes +the requested process and prints its output, prefixing it with the +current time to improve log usefulness. 
+""" + +import sys + +if sys.version_info[0:2] < (3, 5): + print("run-task requires Python 3.5+") + sys.exit(1) + +import argparse +import datetime +import errno +import io +import json +import os +import platform +import re +import shutil +import signal +import socket +import stat +import subprocess +import time +import urllib.error +import urllib.request +from pathlib import Path +from threading import Thread +from typing import Optional + +SECRET_BASEURL_TPL = "http://taskcluster/secrets/v1/secret/{}" + +GITHUB_SSH_FINGERPRINT = ( + b"github.com ssh-ed25519 " + b"AAAAC3NzaC1lZDI1NTE5AAAAIOMqqnkVzrm0SdG6UOoqKLsabgH5C9okWi0dh2l9GKJl\n" + b"github.com ecdsa-sha2-nistp256 " + b"AAAAE2VjZHNhLXNoYTItbmlzdHAyNTYAAAAIbmlzdHAyNTYAAABBBEmKSENjQEezOmxkZMy7opKgwFB" + b"9nkt5YRrYMjNuG5N87uRgg6CLrbo5wAdT/y6v0mKV0U2w0WZ2YB/++Tpockg=\n" + b"github.com ssh-rsa " + b"AAAAB3NzaC1yc2EAAAADAQABAAABgQCj7ndNxQowgcQnjshcLrqPEiiphnt+VTTvDP6mHBL9j1aNUkY" + b"4Ue1gvwnGLVlOhGeYrnZaMgRK6+PKCUXaDbC7qtbW8gIkhL7aGCsOr/C56SJMy/BCZfxd1nWzAOxSDP" + b"gVsmerOBYfNqltV9/hWCqBywINIR+5dIg6JTJ72pcEpEjcYgXkE2YEFXV1JHnsKgbLWNlhScqb2UmyR" + b"kQyytRLtL+38TGxkxCflmO+5Z8CSSNY7GidjMIZ7Q4zMjA2n1nGrlTDkzwDCsw+wqFPGQA179cnfGWO" + b"WRVruj16z6XyvxvjJwbz0wQZ75XK5tKSb7FNyeIEs4TT4jk+S4dhPeAUC5y+bDYirYgM4GC7uEnztnZ" + b"yaVWQ7B381AK4Qdrwt51ZqExKbQpTUNn+EjqoTwvqNj4kqx5QUCI0ThS/YkOxJCXmPUWZbhjpCg56i+" + b"2aB6CmK2JGhn57K5mj0MNdBXA4/WnwH6XoPWJzK5Nyu2zB3nAZp+S5hpQs+p1vN1/wsjk=\n" +) + + +CACHE_UID_GID_MISMATCH = """ +There is a UID/GID mismatch on the cache. This likely means: + +a) different tasks are running as a different user/group +b) different Docker images have different UID/GID for the same user/group + +Our cache policy is that the UID/GID for ALL tasks must be consistent +for the lifetime of the cache. This eliminates permissions problems due +to file/directory user/group ownership. + +To make this error go away, ensure that all Docker images are use +a consistent UID/GID and that all tasks using this cache are running as +the same user/group. +""" + + +NON_EMPTY_VOLUME = """ +error: volume %s is not empty + +Our Docker image policy requires volumes to be empty. + +The volume was likely populated as part of building the Docker image. +Change the Dockerfile and anything run from it to not create files in +any VOLUME. + +A lesser possibility is that you stumbled upon a TaskCluster platform bug +where it fails to use new volumes for tasks. +""" + + +FETCH_CONTENT_NOT_FOUND = """ +error: fetch-content script not found + +The script at `taskcluster/scripts/misc/fetch-content` could not be +detected in the current environment. +""" + +# The exit code to use when caches should be purged and the task retried. +# This is EX_OSFILE (from sysexits.h): +# Some system file does not exist, cannot be opened, or has some +# sort of error (e.g., syntax error). +EXIT_PURGE_CACHE = 72 + + +IS_MACOSX = sys.platform == "darwin" +IS_POSIX = os.name == "posix" +IS_WINDOWS = os.name == "nt" + +# Both mercurial and git use sha1 as revision idenfiers. Luckily, both define +# the same value as the null revision. +# +# https://github.com/git/git/blob/dc04167d378fb29d30e1647ff6ff51dd182bc9a3/t/oid-info/hash-info#L7 +# https://www.mercurial-scm.org/repo/hg-stable/file/82efc31bd152/mercurial/node.py#l30 +NULL_REVISION = "0000000000000000000000000000000000000000" + + +def print_line(prefix, m): + now = datetime.datetime.utcnow().isoformat().encode("utf-8") + # slice microseconds to 3 decimals. + now = now[:-3] if now[-7:-6] == b"." 
else now + sys.stdout.buffer.write(b"[%s %sZ] %s" % (prefix, now, m)) + sys.stdout.buffer.flush() + + +def _call_windows_retry(func, args=(), retry_max=5, retry_delay=0.5): + """ + It's possible to see spurious errors on Windows due to various things + keeping a handle to the directory open (explorer, virus scanners, etc) + So we try a few times if it fails with a known error. + retry_delay is multiplied by the number of failed attempts to increase + the likelihood of success in subsequent attempts. + """ + retry_count = 0 + while True: + try: + func(*args) + except OSError as e: + # Error codes are defined in: + # https://docs.python.org/3/library/errno.html#module-errno + if e.errno not in (errno.EACCES, errno.ENOTEMPTY, errno.ENOENT): + raise + + if retry_count == retry_max: + raise + + retry_count += 1 + + print( + '%s() failed for "%s". Reason: %s (%s). Retrying...' + % (func.__name__, args, e.strerror, e.errno) + ) + time.sleep(retry_count * retry_delay) + else: + # If no exception has been thrown it should be done + break + + +def remove(path): + """Removes the specified file, link, or directory tree. + + This is a replacement for shutil.rmtree that works better under + windows. It does the following things: + + - check path access for the current user before trying to remove + - retry operations on some known errors due to various things keeping + a handle on file paths - like explorer, virus scanners, etc. The + known errors are errno.EACCES and errno.ENOTEMPTY, and it will + retry up to 5 five times with a delay of (failed_attempts * 0.5) seconds + between each attempt. + + Note that no error will be raised if the given path does not exists. + + :param path: path to be removed + """ + + def _update_permissions(path): + """Sets specified pemissions depending on filetype""" + if os.path.islink(path): + # Path is a symlink which we don't have to modify + # because it should already have all the needed permissions + return + + stats = os.stat(path) + + if os.path.isfile(path): + mode = stats.st_mode | stat.S_IWUSR + elif os.path.isdir(path): + mode = stats.st_mode | stat.S_IWUSR | stat.S_IXUSR + else: + # Not supported type + return + + _call_windows_retry(os.chmod, (path, mode)) + + if not os.path.lexists(path): + print_line(b"remove", b"WARNING: %s does not exists!\n" % path.encode("utf-8")) + return + + """ + On Windows, adds '\\\\?\\' to paths which match ^[A-Za-z]:\\.* to access + files or directories that exceed MAX_PATH(260) limitation or that ends + with a period. + """ + if ( + sys.platform in ("win32", "cygwin") + and len(path) >= 3 + and path[1] == ":" + and path[2] == "\\" + ): + path = "\\\\?\\%s" % path + + if os.path.isfile(path) or os.path.islink(path): + # Verify the file or link is read/write for the current user + _update_permissions(path) + _call_windows_retry(os.remove, (path,)) + + elif os.path.isdir(path): + # Verify the directory is read/write/execute for the current user + _update_permissions(path) + + # We're ensuring that every nested item has writable permission. 
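# (Context: on Windows, shutil.rmtree fails with an access error on read-only
# entries, so the write bit is added to every nested file and directory before
# the retry-wrapped rmtree call below.)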
+ for root, dirs, files in os.walk(path): + for entry in dirs + files: + _update_permissions(os.path.join(root, entry)) + _call_windows_retry(shutil.rmtree, (path,)) + + +def run_required_command(prefix, args, *, extra_env=None, cwd=None): + res = run_command(prefix, args, extra_env=extra_env, cwd=cwd) + if res: + sys.exit(res) + + +def retry_required_command(prefix, args, *, extra_env=None, cwd=None, retries=2): + backoff = 1 + while True: + res = run_command(prefix, args, extra_env=extra_env, cwd=cwd) + if not res: + return + if not retries: + sys.exit(res) + retries -= 1 + backoff *= 2 + time.sleep(backoff) + + +def run_command(prefix, args, *, extra_env=None, cwd=None): + """Runs a process and prefixes its output with the time. + + Returns the process exit code. + """ + print_line(prefix, b"executing %r\n" % args) + + env = dict(os.environ) + env.update(extra_env or {}) + + # Note: TaskCluster's stdin is a TTY. This attribute is lost + # when we pass sys.stdin to the invoked process. If we cared + # to preserve stdin as a TTY, we could make this work. But until + # someone needs it, don't bother. + + # We want stdout to be bytes on Python 3. That means we can't use + # universal_newlines=True (because it implies text mode). But + # p.stdout.readline() won't work for bytes text streams. So, on Python 3, + # we manually install a latin1 stream wrapper. This allows us to readline() + # and preserves bytes, without losing any data. + + p = subprocess.Popen( + args, + # Disable buffering because we want to receive output + # as it is generated so timestamps in logs are + # accurate. + bufsize=0, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + stdin=sys.stdin.fileno(), + cwd=cwd, + env=env, + ) + + stdout = io.TextIOWrapper(p.stdout, encoding="latin1") + + while True: + data = stdout.readline().encode("latin1") + + if data == b"": + break + + print_line(prefix, data) + + return p.wait() + + +def get_posix_user_group(user, group): + import grp + import pwd + + try: + user_record = pwd.getpwnam(user) + except KeyError: + print("could not find user %s; specify a valid user with --user" % user) + sys.exit(1) + + try: + group_record = grp.getgrnam(group) + except KeyError: + print("could not find group %s; specify a valid group with --group" % group) + sys.exit(1) + + # Most tasks use worker:worker. We require they have a specific numeric ID + # because otherwise it is too easy for files written to caches to have + # mismatched numeric IDs, which results in permissions errors. + if user_record.pw_name == "worker" and user_record.pw_uid != 1000: + print("user `worker` must have uid=1000; got %d" % user_record.pw_uid) + sys.exit(1) + + if group_record.gr_name == "worker" and group_record.gr_gid != 1000: + print("group `worker` must have gid=1000; got %d" % group_record.gr_gid) + sys.exit(1) + + # Find all groups to which this user is a member. + gids = [g.gr_gid for g in grp.getgrall() if group in g.gr_mem] + + return user_record, group_record, gids + + +def write_audit_entry(path, msg): + now = datetime.datetime.utcnow().isoformat().encode("utf-8") + with open(path, "ab") as fh: + fh.write(b"[%sZ %s] %s\n" % (now, os.environb.get(b"TASK_ID", b"UNKNOWN"), msg)) + + +WANTED_DIR_MODE = stat.S_IXUSR | stat.S_IRUSR | stat.S_IWUSR + + +def set_dir_permissions(path, uid, gid): + st = os.lstat(path) + + if st.st_uid != uid or st.st_gid != gid: + os.chown(path, uid, gid) + + # Also make sure dirs are writable in case we need to delete + # them. 
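# (WANTED_DIR_MODE is user rwx, i.e. 0o700; the chmod below only ever adds
# bits, it never drops permissions the directory already has.)
import stat
assert stat.S_IRUSR | stat.S_IWUSR | stat.S_IXUSR == 0o700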
+ if st.st_mode & WANTED_DIR_MODE != WANTED_DIR_MODE: + os.chmod(path, st.st_mode | WANTED_DIR_MODE) + + +def chown_recursive(path, user, group, uid, gid): + print_line( + b"chown", + b"recursively changing ownership of %s to %s:%s\n" + % (path.encode("utf-8"), user.encode("utf-8"), group.encode("utf-8")), + ) + + set_dir_permissions(path, uid, gid) + + for root, dirs, files in os.walk(path): + for d in dirs: + set_dir_permissions(os.path.join(root, d), uid, gid) + + for f in files: + # File may be a symlink that points to nowhere. In which case + # os.chown() would fail because it attempts to follow the + # symlink. We only care about directory entries, not what + # they point to. So setting the owner of the symlink should + # be sufficient. + os.lchown(os.path.join(root, f), uid, gid) + + +def configure_cache_posix(cache, user, group, untrusted_caches, running_as_root): + """Configure a cache path on POSIX platforms. + + For each cache, we write out a special file denoting attributes and + capabilities of run-task and the task being executed. These attributes + are used by subsequent run-task invocations to validate that use of + the cache is acceptable. + + We /could/ blow away the cache data on requirements mismatch. + While this would be convenient, this could result in "competing" tasks + effectively undoing the other's work. This would slow down task + execution in aggregate. Without monitoring for this, people may not notice + the problem and tasks would be slower than they could be. We follow the + principle of "fail fast" to ensure optimal task execution. + + We also write an audit log of who used the caches. This log is printed + during failures to help aid debugging. + """ + + our_requirements = { + # Include a version string that we can bump whenever to trigger + # fresh caches. The actual value is not relevant and doesn't need + # to follow any explicit order. Since taskgraph bakes this file's + # hash into cache names, any change to this file/version is sufficient + # to force the use of a new cache. + b"version=1", + # Include the UID and GID the task will run as to ensure that tasks + # with different UID and GID don't share the same cache. + b"uid=%d" % user.pw_uid, + b"gid=%d" % group.gr_gid, + } + + requires_path = os.path.join(cache, ".cacherequires") + audit_path = os.path.join(cache, ".cachelog") + + # The cache is empty. Configure it. + if not os.listdir(cache): + print_line( + b"cache", + b"cache %s is empty; writing requirements: " + b"%s\n" % (cache.encode("utf-8"), b" ".join(sorted(our_requirements))), + ) + + # We write a requirements file so future invocations know what the + # requirements are. + with open(requires_path, "wb") as fh: + fh.write(b"\n".join(sorted(our_requirements))) + + # And make it read-only as a precaution against deletion. + os.chmod(requires_path, stat.S_IRUSR | stat.S_IRGRP | stat.S_IROTH) + + write_audit_entry( + audit_path, + b"created; requirements: %s" % b", ".join(sorted(our_requirements)), + ) + + set_dir_permissions(cache, user.pw_uid, group.gr_gid) + return + + # The cache has content and we have a requirements file. Validate + # requirements alignment. 
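# (For a typical task running as worker:worker with uid/gid 1000, the
# .cacherequires file written above holds the lines gid=1000, uid=1000 and
# version=1, one per line and sorted, so the validation below reduces to a
# set comparison:)
wanted_requirements = {b"version=1", b"uid=1000", b"gid=1000"}
our_requirements = {b"version=1", b"uid=1000", b"gid=1000"}
assert not (wanted_requirements - our_requirements)  # cache is usable as-is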
+ if os.path.exists(requires_path): + with open(requires_path, "rb") as fh: + wanted_requirements = set(fh.read().splitlines()) + + print_line( + b"cache", + b"cache %s exists; requirements: %s\n" + % (cache.encode("utf-8"), b" ".join(sorted(wanted_requirements))), + ) + + missing = wanted_requirements - our_requirements + + # Allow requirements mismatch for uid/gid if and only if caches + # are untrusted. This allows cache behavior on Try to be + # reasonable. Otherwise, random tasks could "poison" cache + # usability by introducing uid/gid mismatches. For untrusted + # environments like Try, this is a perfectly reasonable thing to + # allow. + if ( + missing + and untrusted_caches + and running_as_root + and all(s.startswith((b"uid=", b"gid=")) for s in missing) + ): + print_line( + b"cache", + b"cache %s uid/gid mismatch; this is acceptable " + b"because caches for this task are untrusted; " + b"changing ownership to facilitate cache use\n" % cache.encode("utf-8"), + ) + chown_recursive( + cache, user.pw_name, group.gr_name, user.pw_uid, group.gr_gid + ) + + # And write out the updated reality. + with open(requires_path, "wb") as fh: + fh.write(b"\n".join(sorted(our_requirements))) + + write_audit_entry( + audit_path, + b"chown; requirements: %s" % b", ".join(sorted(our_requirements)), + ) + + elif missing: + print( + "error: requirements for populated cache %s differ from " + "this task" % cache + ) + print( + "cache requirements: %s" + % " ".join(sorted(s.decode("utf-8") for s in wanted_requirements)) + ) + print( + "our requirements: %s" + % " ".join(sorted(s.decode("utf-8") for s in our_requirements)) + ) + if any(s.startswith((b"uid=", b"gid=")) for s in missing): + print(CACHE_UID_GID_MISMATCH) + + write_audit_entry( + audit_path, + b"requirements mismatch; wanted: %s" + % b", ".join(sorted(our_requirements)), + ) + + print("") + print("audit log:") + with open(audit_path, "r") as fh: + print(fh.read()) + + return True + else: + write_audit_entry(audit_path, b"used") + + # We don't need to adjust permissions here because the cache is + # associated with a uid/gid and the first task should have set + # a proper owner/group. + + return + + # The cache has content and no requirements file. This shouldn't + # happen because run-task should be the first thing that touches a + # cache. + print( + "error: cache %s is not empty and is missing a " + ".cacherequires file; the cache names for this task are " + "likely mis-configured or TASKCLUSTER_CACHES is not set " + "properly" % cache + ) + + write_audit_entry(audit_path, b"missing .cacherequires") + return True + + +def configure_volume_posix(volume, user, group, running_as_root): + # The only time we should see files in the volume is if the Docker + # image build put files there. + # + # For the sake of simplicity, our policy is that volumes should be + # empty. This also has the advantage that an empty volume looks + # a lot like an empty cache. Tasks can rely on caches being + # swapped in and out on any volume without any noticeable change + # of behavior. + volume_files = os.listdir(volume) + if volume_files: + print(NON_EMPTY_VOLUME % volume) + print("entries in root directory: %s" % " ".join(sorted(volume_files))) + sys.exit(1) + + # The volume is almost certainly owned by root:root. Chown it so it + # is writable. 
+ + if running_as_root: + print_line( + b"volume", + b"changing ownership of volume %s " + b"to %d:%d\n" % (volume.encode("utf-8"), user.pw_uid, group.gr_gid), + ) + set_dir_permissions(volume, user.pw_uid, group.gr_gid) + + +def _clean_git_checkout(destination_path): + # Delete untracked files (i.e. build products) + print_line(b"vcs", b"cleaning git checkout...\n") + args = [ + "git", + "clean", + # Two -f`s causes subdirectories with `.git` + # directories to be cleaned as well. + "-nxdff", + ] + print_line(b"vcs", b"executing %r\n" % args) + p = subprocess.Popen( + args, + # Disable buffering because we want to receive output + # as it is generated so timestamps in logs are + # accurate. + bufsize=0, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + stdin=sys.stdin.fileno(), + cwd=destination_path, + env=os.environ, + ) + stdout = io.TextIOWrapper(p.stdout, encoding="latin1") + ret = p.wait() + if ret: + sys.exit(ret) + data = stdout.read() + prefix = "Would remove " + filenames = [ + os.path.join(destination_path, line[len(prefix) :]) + for line in data.splitlines() + ] + print_line(b"vcs", b"removing %r\n" % filenames) + for filename in filenames: + remove(filename) + print_line(b"vcs", b"successfully cleaned git checkout!\n") + + +def git_checkout( + destination_path: str, + head_repo: str, + base_repo: Optional[str], + base_ref: Optional[str], + base_rev: Optional[str], + ref: Optional[str], + commit: Optional[str], + ssh_key_file: Optional[Path], + ssh_known_hosts_file: Optional[Path], +): + env = { + # abort if transfer speed is lower than 1kB/s for 1 minute + "GIT_HTTP_LOW_SPEED_LIMIT": "1024", + "GIT_HTTP_LOW_SPEED_TIME": "60", + "PYTHONUNBUFFERED": "1", + } + + if ssh_key_file and ssh_known_hosts_file: + if not ssh_key_file.exists(): + raise RuntimeError("Can't find specified ssh_key file.") + if not ssh_known_hosts_file.exists(): + raise RuntimeError("Can't find specified known_hosts file.") + env["GIT_SSH_COMMAND"] = " ".join( + [ + "ssh", + "-oIdentityFile={}".format(ssh_key_file.as_posix()), + "-oStrictHostKeyChecking=yes", + "-oUserKnownHostsFile={}".format(ssh_known_hosts_file.as_posix()), + ] + ) + elif ssh_key_file or ssh_known_hosts_file: + raise RuntimeError( + "Must specify both ssh_key_file and ssh_known_hosts_file, if either are specified", + ) + + if not os.path.exists(destination_path): + # Repository doesn't already exist, needs to be cloned + args = [ + "git", + "clone", + base_repo if base_repo else head_repo, + destination_path, + ] + + retry_required_command(b"vcs", args, extra_env=env) + + if base_ref: + args = ["git", "fetch", "origin", base_ref] + + retry_required_command(b"vcs", args, cwd=destination_path, extra_env=env) + + # Create local branch so that taskgraph is able to compute differences + # between the head branch and the base one, if needed + args = ["git", "checkout", base_ref] + + retry_required_command(b"vcs", args, cwd=destination_path, extra_env=env) + + # When commits are force-pushed (like on a testing branch), base_rev doesn't + # exist on base_ref. Fetching it allows taskgraph to compute differences + # between the previous state before the force-push and the current state. + # + # Unlike base_ref just above, there is no need to checkout the revision: + # it's immediately available after the fetch. 
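# (Context: NULL_REVISION is the all-zero node defined near the top of this
# script; both git and hg use it to mean "no such revision", so a base_rev
# equal to it -- i.e. no usable base -- skips the extra fetch below.)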
+ if base_rev and base_rev != NULL_REVISION: + args = ["git", "fetch", "origin", base_rev] + + retry_required_command(b"vcs", args, cwd=destination_path, extra_env=env) + + # If a ref was provided, it might be tag, so we need to make sure we fetch + # those. This is explicitly only done when base and head repo match, + # because it is the only scenario where tags could be present. (PRs, for + # example, always include an explicit rev.) Failure to do this could result + # in not having a tag, or worse: having an outdated version of one. + # `--force` is needed to be able to update an existing tag. + if ref and base_repo == head_repo: + args = [ + "git", + "fetch", + "--tags", + "--force", + base_repo, + ref, + ] + + retry_required_command(b"vcs", args, cwd=destination_path, extra_env=env) + + # If a ref isn't provided, we fetch all refs from head_repo, which may be slow + args = [ + "git", + "fetch", + "--no-tags", + head_repo, + ref if ref else "+refs/heads/*:refs/remotes/work/*", + ] + + retry_required_command(b"vcs", args, cwd=destination_path, extra_env=env) + + args = [ + "git", + "checkout", + "-f", + ] + + if ref: + args.extend(["-B", ref]) + + # `git fetch` set `FETCH_HEAD` reference to the last commit of the desired branch + args.append(commit if commit else "FETCH_HEAD") + + run_required_command(b"vcs", args, cwd=destination_path) + + if os.path.exists(os.path.join(destination_path, ".gitmodules")): + args = [ + "git", + "submodule", + "init", + ] + + run_required_command(b"vcs", args, cwd=destination_path) + + args = [ + "git", + "submodule", + "update", + "--force", # Overrides any potential local changes + ] + + run_required_command(b"vcs", args, cwd=destination_path) + + _clean_git_checkout(destination_path) + + args = ["git", "rev-parse", "--verify", "HEAD"] + + commit_hash = subprocess.check_output( + args, cwd=destination_path, universal_newlines=True + ).strip() + assert re.match("^[a-f0-9]{40}$", commit_hash) + + if head_repo.startswith("https://github.com"): + if head_repo.endswith("/"): + head_repo = head_repo[:-1] + + tinderbox_link = "{}/commit/{}".format(head_repo, commit_hash) + repo_name = head_repo.split("/")[-1] + else: + tinderbox_link = head_repo + repo_name = head_repo + + msg = ( + "TinderboxPrint:<a href='{link}' " + "title='Built from {name} commit {commit_hash}'>" + "{commit_hash}</a>\n".format( + commit_hash=commit_hash, link=tinderbox_link, name=repo_name + ) + ) + + print_line(b"vcs", msg.encode("utf-8")) + + return commit_hash + + +def fetch_ssh_secret(secret_name): + """Retrieves the private ssh key, and returns it as a StringIO object""" + secret_url = SECRET_BASEURL_TPL.format(secret_name) + try: + print_line( + b"vcs", + b"fetching secret %s from %s\n" + % (secret_name.encode("utf-8"), secret_url.encode("utf-8")), + ) + res = urllib.request.urlopen(secret_url, timeout=10) + secret = res.read() + try: + secret = json.loads(secret.decode("utf-8")) + except ValueError: + print_line(b"vcs", b"invalid JSON in secret") + sys.exit(1) + except (urllib.error.URLError, socket.timeout): + print_line(b"vcs", b"Unable to retrieve ssh secret. 
aborting...") + sys.exit(1) + + return secret["secret"]["ssh_privkey"] + + +def hg_checkout( + destination_path: str, + head_repo: str, + base_repo: Optional[str], + store_path: str, + sparse_profile: Optional[str], + branch: Optional[str], + revision: Optional[str], +): + if IS_MACOSX: + hg_bin = "/tools/python27-mercurial/bin/hg" + elif IS_POSIX: + hg_bin = "hg" + elif IS_WINDOWS: + # This is where OCC installs it in the AMIs. + hg_bin = r"C:\Program Files\Mercurial\hg.exe" + if not os.path.exists(hg_bin): + print("could not find Mercurial executable: %s" % hg_bin) + sys.exit(1) + else: + raise RuntimeError("Must be running on mac, posix or windows") + + args = [ + hg_bin, + "robustcheckout", + "--sharebase", + store_path, + "--purge", + ] + + if base_repo: + args.extend(["--upstream", base_repo]) + if sparse_profile: + args.extend(["--sparseprofile", sparse_profile]) + + # Specify method to checkout a revision. This defaults to revisions as + # SHA-1 strings, but also supports symbolic revisions like `tip` via the + # branch flag. + args.extend( + [ + "--branch" if branch else "--revision", + branch or revision, + head_repo, + destination_path, + ] + ) + + run_required_command(b"vcs", args, extra_env={"PYTHONUNBUFFERED": "1"}) + + # Update the current revision hash and ensure that it is well formed. + revision = subprocess.check_output( + [hg_bin, "log", "--rev", ".", "--template", "{node}"], + cwd=destination_path, + # Triggers text mode on Python 3. + universal_newlines=True, + ) + + assert re.match("^[a-f0-9]{40}$", revision) + + msg = ( + "TinderboxPrint:<a href={head_repo}/rev/{revision} " + "title='Built from {repo_name} revision {revision}'>" + "{revision}</a>\n".format( + revision=revision, head_repo=head_repo, repo_name=head_repo.split("/")[-1] + ) + ) + + print_line(b"vcs", msg.encode("utf-8")) + + return revision + + +def fetch_artifacts(): + print_line(b"fetches", b"fetching artifacts\n") + + fetch_content = shutil.which("fetch-content") + + if not fetch_content or not os.path.isfile(fetch_content): + fetch_content = os.path.join(os.path.dirname(__file__), "fetch-content") + + if not os.path.isfile(fetch_content): + print(FETCH_CONTENT_NOT_FOUND) + sys.exit(1) + + cmd = [sys.executable, "-u", fetch_content, "task-artifacts"] + print_line(b"fetches", b"executing %r\n" % cmd) + subprocess.run(cmd, check=True, env=os.environ) + print_line(b"fetches", b"finished fetching artifacts\n") + + +def add_vcs_arguments(parser, project, name): + """Adds arguments to ArgumentParser to control VCS options for a project.""" + + parser.add_argument( + "--%s-checkout" % project, + help="Directory where %s checkout should be created" % name, + ) + parser.add_argument( + "--%s-sparse-profile" % project, + help="Path to sparse profile for %s checkout" % name, + ) + + +def collect_vcs_options(args, project, name): + checkout = getattr(args, "%s_checkout" % project) + sparse_profile = getattr(args, "%s_sparse_profile" % project) + + env_prefix = project.upper() + + repo_type = os.environ.get("%s_REPOSITORY_TYPE" % env_prefix) + base_repo = os.environ.get("%s_BASE_REPOSITORY" % env_prefix) + base_ref = os.environ.get("%s_BASE_REF" % env_prefix) + base_rev = os.environ.get("%s_BASE_REV" % env_prefix) + head_repo = os.environ.get("%s_HEAD_REPOSITORY" % env_prefix) + revision = os.environ.get("%s_HEAD_REV" % env_prefix) + ref = os.environ.get("%s_HEAD_REF" % env_prefix) + pip_requirements = os.environ.get("%s_PIP_REQUIREMENTS" % env_prefix) + private_key_secret = os.environ.get("%s_SSH_SECRET_NAME" % 
env_prefix) + + store_path = os.environ.get("HG_STORE_PATH") + + # Expand ~ in some paths. + if checkout: + checkout = os.path.abspath(os.path.expanduser(checkout)) + if store_path: + store_path = os.path.abspath(os.path.expanduser(store_path)) + + if pip_requirements: + pip_requirements = os.path.join(checkout, pip_requirements) + + # Some callers set the base repository to mozilla-central for historical + # reasons. Switch to mozilla-unified because robustcheckout works best + # with it. + if base_repo == "https://hg.mozilla.org/mozilla-central": + base_repo = "https://hg.mozilla.org/mozilla-unified" + + return { + "store-path": store_path, + "project": project, + "name": name, + "env-prefix": env_prefix, + "checkout": checkout, + "sparse-profile": sparse_profile, + "base-repo": base_repo, + "base-ref": base_ref, + "base-rev": base_rev, + "head-repo": head_repo, + "revision": revision, + "ref": ref, + "repo-type": repo_type, + "ssh-secret-name": private_key_secret, + "pip-requirements": pip_requirements, + } + + +def vcs_checkout_from_args(options): + if not options["checkout"]: + if options["ref"] and not options["revision"]: + print("task should be defined in terms of non-symbolic revision") + sys.exit(1) + return + + revision = options["revision"] + ref = options["ref"] + ssh_key_file = None + ssh_known_hosts_file = None + ssh_dir = None + + try: + if options.get("ssh-secret-name"): + ssh_dir = Path("~/.ssh-run-task").expanduser() + os.makedirs(ssh_dir, 0o700) + ssh_key_file = ssh_dir.joinpath("private_ssh_key") + ssh_key = fetch_ssh_secret(options["ssh-secret-name"]) + # We don't use write_text here, to avoid \n -> \r\n on windows + ssh_key_file.write_bytes(ssh_key.encode("ascii")) + ssh_key_file.chmod(0o600) + # TODO: We should pull this from a secret, so it can be updated on old trees + ssh_known_hosts_file = ssh_dir.joinpath("known_hosts") + ssh_known_hosts_file.write_bytes(GITHUB_SSH_FINGERPRINT) + + if options["repo-type"] == "git": + if not revision and not ref: + raise RuntimeError( + "Git requires that either a ref, a revision, or both are provided" + ) + + if not ref: + print("Providing a ref will improve the performance of this checkout") + + revision = git_checkout( + options["checkout"], + options["head-repo"], + options["base-repo"], + options["base-ref"], + options["base-rev"], + ref, + revision, + ssh_key_file, + ssh_known_hosts_file, + ) + elif options["repo-type"] == "hg": + if not revision and not ref: + raise RuntimeError( + "Hg requires that at least one of a ref or revision " "is provided" + ) + + revision = hg_checkout( + options["checkout"], + options["head-repo"], + options["base-repo"], + options["store-path"], + options["sparse-profile"], + ref, + revision, + ) + else: + raise RuntimeError('Type of VCS must be either "git" or "hg"') + finally: + if ssh_dir: + shutil.rmtree(ssh_dir, ignore_errors=True) + pass + + os.environ["%s_HEAD_REV" % options["env-prefix"]] = revision + + +def install_pip_requirements(repositories): + """Install pip requirements files from specified repositories, if necessary.""" + requirements = [ + r["pip-requirements"] for r in repositories if r["pip-requirements"] + ] + if not requirements: + return + + cmd = [sys.executable, "-mpip", "install"] + if os.environ.get("PIP_DISABLE_REQUIRE_HASHES") != "1": + cmd.append("--require-hashes") + + for path in requirements: + cmd.extend(["-r", path]) + + run_required_command(b"pip-install", cmd) + + +def maybe_run_resource_monitoring(): + """Run the resource monitor if available. 
+ + Discussion in https://github.com/taskcluster/taskcluster-rfcs/pull/160 + and https://bugzil.la/1648051 + + """ + if "MOZ_FETCHES" not in os.environ: + return + if "RESOURCE_MONITOR_OUTPUT" not in os.environ: + return + + prefix = b"resource_monitor" + + executable = "{}/resource-monitor/resource-monitor{}".format( + os.environ.get("MOZ_FETCHES_DIR"), ".exe" if IS_WINDOWS else "" + ) + + if not os.path.exists(executable) or not os.access(executable, os.X_OK): + print_line(prefix, b"%s not executable\n" % executable.encode("utf-8")) + return + args = [ + executable, + "-process", + str(os.getpid()), + "-output", + os.environ["RESOURCE_MONITOR_OUTPUT"], + ] + print_line(prefix, b"Resource monitor starting: %s\n" % str(args).encode("utf-8")) + # Avoid environment variables the payload doesn't need. + del os.environ["RESOURCE_MONITOR_OUTPUT"] + + # Without CREATE_NEW_PROCESS_GROUP Windows signals will attempt to kill run-task, too. + process = subprocess.Popen( + args, + # Disable buffering because we want to receive output + # as it is generated so timestamps in logs are + # accurate. + bufsize=0, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + creationflags=subprocess.CREATE_NEW_PROCESS_GROUP if IS_WINDOWS else 0, + cwd=os.getcwd(), + ) + + def capture_output(): + fh = io.TextIOWrapper(process.stdout, encoding="latin1") + while True: + data = fh.readline().encode("latin1") + if data == b"": + break + print_line(prefix, data) + + monitor_process = Thread(target=capture_output) + monitor_process.start() + return process + + +def _display_python_version(): + print_line( + b"setup", b"Python version: %s\n" % platform.python_version().encode("utf-8") + ) + + +def main(args): + os.environ["TASK_WORKDIR"] = os.getcwd() + print_line( + b"setup", + b"run-task started in %s\n" % os.environ["TASK_WORKDIR"].encode("utf-8"), + ) + print_line( + b"setup", + b"Invoked by command: %s\n" % " ".join(args).encode("utf-8"), + ) + _display_python_version() + running_as_root = IS_POSIX and os.getuid() == 0 + + # Arguments up to '--' are ours. After are for the main task + # to be executed. 
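# For illustration (flags and paths hypothetical, but each flag is defined by
# the argument parser set up below): an invocation such as
#   run-task --user=worker --group=worker --vcs-checkout=/builds/worker/checkouts/vcs -- bash -c 'make build'
# splits at the first '--'; everything before it is parsed by this script
# (our_args) and everything after it becomes the command to run (task_args).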
+ try: + i = args.index("--") + our_args = args[0:i] + task_args = args[i + 1 :] + except ValueError: + our_args = args + task_args = [] + + parser = argparse.ArgumentParser() + parser.add_argument("--user", default="worker", help="user to run as") + parser.add_argument("--group", default="worker", help="group to run as") + parser.add_argument("--task-cwd", help="directory to run the provided command in") + + repositories = os.environ.get("REPOSITORIES") + if repositories: + repositories = json.loads(repositories) + else: + repositories = {"vcs": "repository"} + + for repository, name in repositories.items(): + add_vcs_arguments(parser, repository, name) + + parser.add_argument( + "--fetch-hgfingerprint", action="store_true", help=argparse.SUPPRESS + ) + + args = parser.parse_args(our_args) + + repositories = [ + collect_vcs_options(args, repository, name) + for (repository, name) in repositories.items() + ] + # Sort repositories so that parent checkout paths come before children + repositories.sort(key=lambda repo: Path(repo["checkout"] or "/").parts) + + uid = gid = gids = user = group = None + if IS_POSIX and running_as_root: + user, group, gids = get_posix_user_group(args.user, args.group) + uid = user.pw_uid + gid = group.gr_gid + + if running_as_root and os.path.exists("/dev/kvm"): + # Ensure kvm permissions for worker, required for Android x86 + st = os.stat("/dev/kvm") + os.chmod("/dev/kvm", st.st_mode | 0o666) + + # Validate caches. + # + # Taskgraph should pass in a list of paths that are caches via an + # environment variable (which we don't want to pass down to child + # processes). + + if "TASKCLUSTER_CACHES" in os.environ: + caches = os.environ["TASKCLUSTER_CACHES"].split(";") + del os.environ["TASKCLUSTER_CACHES"] + else: + caches = [] + + if "TASKCLUSTER_UNTRUSTED_CACHES" in os.environ: + untrusted_caches = True + del os.environ["TASKCLUSTER_UNTRUSTED_CACHES"] + else: + untrusted_caches = False + + for cache in caches: + if not os.path.isdir(cache): + print( + "error: cache %s is not a directory; this should never " + "happen" % cache + ) + return 1 + + purge = configure_cache_posix( + cache, user, group, untrusted_caches, running_as_root + ) + + if purge: + return EXIT_PURGE_CACHE + + if "TASKCLUSTER_VOLUMES" in os.environ: + volumes = os.environ["TASKCLUSTER_VOLUMES"].split(";") + del os.environ["TASKCLUSTER_VOLUMES"] + else: + volumes = [] + + if volumes and not IS_POSIX: + print("assertion failed: volumes not expected on Windows") + return 1 + + # Sanitize volumes. + for volume in volumes: + # If a volume is a cache, it was dealt with above. + if volume in caches: + print_line(b"volume", b"volume %s is a cache\n" % volume.encode("utf-8")) + continue + + configure_volume_posix(volume, user, group, running_as_root) + + all_caches_and_volumes = set(map(os.path.normpath, caches)) + all_caches_and_volumes |= set(map(os.path.normpath, volumes)) + + def path_in_cache_or_volume(path): + path = os.path.normpath(path) + + while path: + if path in all_caches_and_volumes: + return True + + path, child = os.path.split(path) + if not child: + break + + return False + + def prepare_checkout_dir(checkout): + if not checkout: + return + + # The checkout path becomes the working directory. Since there are + # special cache files in the cache's root directory and working + # directory purging could blow them away, disallow this scenario. 
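# Illustration (paths hypothetical): if a cache is mounted at
# /builds/worker/checkouts, its root holds sentinel files such as
# .cacherequires, so a checkout aimed directly at /builds/worker/checkouts is
# rejected by the check below, while a child path like
# /builds/worker/checkouts/vcs is fine.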
+ if os.path.exists(os.path.join(checkout, ".cacherequires")): + print("error: cannot perform vcs checkout into cache root: %s" % checkout) + sys.exit(1) + + # TODO given the performance implications, consider making this a fatal + # error. + if not path_in_cache_or_volume(checkout): + print_line( + b"vcs", + b"WARNING: vcs checkout path (%s) not in cache " + b"or volume; performance will likely suffer\n" + % checkout.encode("utf-8"), + ) + + # Ensure the directory for the source checkout exists. + try: + os.makedirs(os.path.dirname(checkout)) + except OSError as e: + if e.errno != errno.EEXIST: + raise + + # And that it is owned by the appropriate user/group. + if running_as_root: + os.chown(os.path.dirname(checkout), uid, gid) + + def prepare_hg_store_path(): + # And ensure the shared store path exists and has proper permissions. + if "HG_STORE_PATH" not in os.environ: + print("error: HG_STORE_PATH environment variable not set") + sys.exit(1) + + store_path = os.environ["HG_STORE_PATH"] + + if not path_in_cache_or_volume(store_path): + print_line( + b"vcs", + b"WARNING: HG_STORE_PATH (%s) not in cache or " + b"volume; performance will likely suffer\n" + % store_path.encode("utf-8"), + ) + + try: + os.makedirs(store_path) + except OSError as e: + if e.errno != errno.EEXIST: + raise + + if running_as_root: + os.chown(store_path, uid, gid) + + repository_paths = [ + Path(repo["checkout"]) for repo in repositories if repo["checkout"] + ] + for repo in repositories: + if not repo["checkout"]: + continue + parents = Path(repo["checkout"]).parents + if any((path in repository_paths) for path in parents): + # Skip creating any checkouts that are inside other checokuts + continue + prepare_checkout_dir(repo["checkout"]) + + if any(repo["checkout"] and repo["repo-type"] == "hg" for repo in repositories): + prepare_hg_store_path() + + if IS_POSIX and running_as_root: + # Drop permissions to requested user. + # This code is modeled after what `sudo` was observed to do in a Docker + # container. We do not bother calling setrlimit() because containers have + # their own limits. + print_line( + b"setup", + b"running as %s:%s\n" + % (args.user.encode("utf-8"), args.group.encode("utf-8")), + ) + + os.setgroups(gids) + os.umask(0o22) + os.setresgid(gid, gid, gid) + os.setresuid(uid, uid, uid) + + for repo in repositories: + vcs_checkout_from_args(repo) + + resource_process = None + + try: + for k in ["MOZ_FETCHES_DIR", "UPLOAD_DIR"] + [ + "{}_PATH".format(repository["project"].upper()) + for repository in repositories + ]: + if k in os.environ: + os.environ[k] = os.path.abspath(os.environ[k]) + print_line( + b"setup", + b"%s is %s\n" % (k.encode("utf-8"), os.environ[k].encode("utf-8")), + ) + + if "MOZ_FETCHES" in os.environ: + fetch_artifacts() + + # Install Python requirements after fetches in case tasks want to use + # fetches to grab dependencies. + install_pip_requirements(repositories) + + resource_process = maybe_run_resource_monitoring() + + return run_command(b"task", task_args, cwd=args.task_cwd) + finally: + if resource_process: + print_line(b"resource_monitor", b"terminating\n") + if IS_WINDOWS: + # .terminate() on Windows is not a graceful shutdown, due to + # differences in signals. CTRL_BREAK_EVENT will work provided + # the subprocess is in a different process group, so this script + # isn't also killed. 
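# On POSIX no process-group trick is needed: the else branch below simply
# calls terminate() (SIGTERM) and then wait()s for the monitor to exit.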
+ os.kill(resource_process.pid, signal.CTRL_BREAK_EVENT) + else: + resource_process.terminate() + resource_process.wait() + fetches_dir = os.environ.get("MOZ_FETCHES_DIR") + if fetches_dir and os.path.isdir(fetches_dir): + print_line(b"fetches", b"removing %s\n" % fetches_dir.encode("utf-8")) + remove(fetches_dir) + print_line(b"fetches", b"finished\n") + + +if __name__ == "__main__": + sys.exit(main(sys.argv[1:])) diff --git a/third_party/python/taskcluster_taskgraph/taskgraph/target_tasks.py b/third_party/python/taskcluster_taskgraph/taskgraph/target_tasks.py new file mode 100644 index 0000000000..1119a1c960 --- /dev/null +++ b/third_party/python/taskcluster_taskgraph/taskgraph/target_tasks.py @@ -0,0 +1,107 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. + + +from taskgraph.util.attributes import ( + match_run_on_git_branches, + match_run_on_projects, + match_run_on_tasks_for, +) + +_target_task_methods = {} + +_GIT_REFS_HEADS_PREFIX = "refs/heads/" + + +def _target_task(name): + def wrap(func): + _target_task_methods[name] = func + return func + + return wrap + + +def get_method(method): + """Get a target_task_method to pass to a TaskGraphGenerator.""" + return _target_task_methods[method] + + +def filter_out_cron(task, parameters): + """ + Filter out tasks that run via cron. + """ + return not task.attributes.get("cron") + + +def filter_for_project(task, parameters): + """Filter tasks by project. Optionally enable nightlies.""" + run_on_projects = set(task.attributes.get("run_on_projects", [])) + return match_run_on_projects(parameters["project"], run_on_projects) + + +def filter_for_tasks_for(task, parameters): + run_on_tasks_for = set(task.attributes.get("run_on_tasks_for", ["all"])) + return match_run_on_tasks_for(parameters["tasks_for"], run_on_tasks_for) + + +def filter_for_git_branch(task, parameters): + """Filter tasks by git branch. + If `run_on_git_branch` is not defined, then task runs on all branches""" + # We cannot filter out on git branches if we not on a git repository + if parameters.get("repository_type") != "git": + return True + + # Pull requests usually have arbitrary names, let's not filter git branches on them. 
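# For example (branch names illustrative): a push whose head_ref is
# "refs/heads/main" is reduced to "main" below and matched against
# run_on_git_branches, while a pull request, whose head ref is an arbitrary
# contributor branch name, is accepted unconditionally by the early return
# that follows.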
+ if parameters["tasks_for"] == "github-pull-request": + return True + + run_on_git_branches = set(task.attributes.get("run_on_git_branches", ["all"])) + git_branch = parameters["head_ref"] + if git_branch.startswith(_GIT_REFS_HEADS_PREFIX): + git_branch = git_branch[len(_GIT_REFS_HEADS_PREFIX) :] + + return match_run_on_git_branches(git_branch, run_on_git_branches) + + +def filter_out_shipping_phase(task, parameters): + return task.attributes.get("shipping_phase") in (None, "build") + + +def standard_filter(task, parameters): + return all( + filter_func(task, parameters) + for filter_func in ( + filter_out_cron, + filter_out_shipping_phase, + filter_for_project, + filter_for_tasks_for, + filter_for_git_branch, + ) + ) + + +@_target_task("default") +def target_tasks_default(full_task_graph, parameters, graph_config): + """Target the tasks which have indicated they should be run on this project + via the `run_on_projects` attributes.""" + return [ + l for l, t in full_task_graph.tasks.items() if standard_filter(t, parameters) + ] + + +@_target_task("codereview") +def target_tasks_codereview(full_task_graph, parameters, graph_config): + """Target the tasks which have indicated they should be run on this project + via the `run_on_projects` attributes.""" + return [ + l + for l, t in full_task_graph.tasks.items() + if standard_filter(t, parameters) and t.attributes.get("code-review") + ] + + +@_target_task("nothing") +def target_tasks_nothing(full_task_graph, parameters, graph_config): + """Select nothing, for DONTBUILD pushes""" + return [] diff --git a/third_party/python/taskcluster_taskgraph/taskgraph/task.py b/third_party/python/taskcluster_taskgraph/taskgraph/task.py new file mode 100644 index 0000000000..45427ac4f7 --- /dev/null +++ b/third_party/python/taskcluster_taskgraph/taskgraph/task.py @@ -0,0 +1,84 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. + +from dataclasses import dataclass, field +from typing import Any, Dict, List, Union + + +@dataclass +class Task: + """ + Representation of a task in a TaskGraph. Each Task has, at creation: + + - kind: the name of the task kind + - label; the label for this task + - attributes: a dictionary of attributes for this task (used for filtering) + - task: the task definition (JSON-able dictionary) + - optimization: optimization to apply to the task (see taskgraph.optimize) + - dependencies: tasks this one depends on, in the form {name: label}, for example + {'build': 'build-linux64/opt', 'docker-image': 'build-docker-image-desktop-test'} + - soft_dependencies: tasks this one may depend on if they are available post + optimisation. They are set as a list of tasks label. + - if_dependencies: only run this task if at least one of these dependencies + are present. + + And later, as the task-graph processing proceeds: + + - task_id -- TaskCluster taskId under which this task will be created + + This class is just a convenience wrapper for the data type and managing + display, comparison, serialization, etc. It has no functionality of its own. 
+ """ + + kind: str + label: str + attributes: Dict + task: Dict + description: str = "" + task_id: Union[str, None] = field(default=None, init=False) + optimization: Union[Dict[str, Any], None] = field(default=None) + dependencies: Dict = field(default_factory=dict) + soft_dependencies: List = field(default_factory=list) + if_dependencies: List = field(default_factory=list) + + def __post_init__(self): + self.attributes["kind"] = self.kind + + def to_json(self): + rv = { + "kind": self.kind, + "label": self.label, + "description": self.description, + "attributes": self.attributes, + "dependencies": self.dependencies, + "soft_dependencies": self.soft_dependencies, + "if_dependencies": self.if_dependencies, + "optimization": self.optimization, + "task": self.task, + } + if self.task_id: + rv["task_id"] = self.task_id + return rv + + @classmethod + def from_json(cls, task_dict): + """ + Given a data structure as produced by taskgraph.to_json, re-construct + the original Task object. This is used to "resume" the task-graph + generation process, for example in Action tasks. + """ + rv = cls( + kind=task_dict["kind"], + label=task_dict["label"], + description=task_dict.get("description", ""), + attributes=task_dict["attributes"], + task=task_dict["task"], + optimization=task_dict["optimization"], + dependencies=task_dict.get("dependencies"), + soft_dependencies=task_dict.get("soft_dependencies"), + if_dependencies=task_dict.get("if_dependencies"), + ) + if "task_id" in task_dict: + rv.task_id = task_dict["task_id"] + return rv diff --git a/third_party/python/taskcluster_taskgraph/taskgraph/taskgraph.py b/third_party/python/taskcluster_taskgraph/taskgraph/taskgraph.py new file mode 100644 index 0000000000..e479a7cf15 --- /dev/null +++ b/third_party/python/taskcluster_taskgraph/taskgraph/taskgraph.py @@ -0,0 +1,72 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. + +from dataclasses import dataclass +from typing import List + +from .graph import Graph +from .task import Task + + +@dataclass(frozen=True) +class TaskGraph: + """ + Representation of a task graph. + + A task graph is a combination of a Graph and a dictionary of tasks indexed + by label. TaskGraph instances should be treated as immutable. + + In the graph, tasks are said to "link to" their dependencies. Whereas + tasks are "linked from" their dependents. + """ + + tasks: List[Task] + graph: Graph + + def __post_init__(self): + assert set(self.tasks) == self.graph.nodes + + def for_each_task(self, f, *args, **kwargs): + for task_label in self.graph.visit_postorder(): + task = self.tasks[task_label] + f(task, self, *args, **kwargs) + + def __getitem__(self, label): + "Get a task by label" + return self.tasks[label] + + def __contains__(self, label): + return label in self.tasks + + def __iter__(self): + "Iterate over tasks in undefined order" + return iter(self.tasks.values()) + + def to_json(self): + "Return a JSON-able object representing the task graph, as documented" + named_links_dict = self.graph.named_links_dict() + # this dictionary may be keyed by label or by taskid, so let's just call it 'key' + tasks = {} + for key in self.graph.visit_postorder(): + tasks[key] = self.tasks[key].to_json() + # overwrite dependencies with the information in the taskgraph's edges. 
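# Each entry produced here has the shape returned by Task.to_json(), roughly
# (illustrative values, label borrowed from the Task docstring):
#   {"kind": "build", "label": "build-linux64/opt", "description": "...",
#    "attributes": {"kind": "build", ...}, "dependencies": {...},
#    "soft_dependencies": [], "if_dependencies": [], "optimization": None,
#    "task": {...}}
# with "dependencies" then overwritten from the graph's edges just below.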
+ tasks[key]["dependencies"] = named_links_dict.get(key, {}) + return tasks + + @classmethod + def from_json(cls, tasks_dict): + """ + This code is used to generate the a TaskGraph using a dictionary + which is representative of the TaskGraph. + """ + tasks = {} + edges = set() + for key, value in tasks_dict.items(): + tasks[key] = Task.from_json(value) + if "task_id" in value: + tasks[key].task_id = value["task_id"] + for depname, dep in value["dependencies"].items(): + edges.add((key, dep, depname)) + task_graph = cls(tasks, Graph(set(tasks), edges)) + return tasks, task_graph diff --git a/third_party/python/taskcluster_taskgraph/taskgraph/transforms/__init__.py b/third_party/python/taskcluster_taskgraph/taskgraph/transforms/__init__.py new file mode 100644 index 0000000000..4fa7b5fc0c --- /dev/null +++ b/third_party/python/taskcluster_taskgraph/taskgraph/transforms/__init__.py @@ -0,0 +1,3 @@ +from taskgraph.transforms import ( # noqa: Added for backwards compat + notify as release_notifications, +) diff --git a/third_party/python/taskcluster_taskgraph/taskgraph/transforms/base.py b/third_party/python/taskcluster_taskgraph/taskgraph/transforms/base.py new file mode 100644 index 0000000000..e6fcd2400c --- /dev/null +++ b/third_party/python/taskcluster_taskgraph/taskgraph/transforms/base.py @@ -0,0 +1,158 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. + + +import re +from dataclasses import dataclass, field +from typing import Dict, List, Union + +from taskgraph.task import Task + +from ..config import GraphConfig +from ..parameters import Parameters +from ..util.memoize import memoize +from ..util.schema import Schema, validate_schema + + +@dataclass(frozen=True) +class RepoConfig: + prefix: str + name: str + base_repository: str + head_repository: str + head_ref: str + type: str + path: str = "" + head_rev: Union[str, None] = None + ssh_secret_name: Union[str, None] = None + + +@dataclass(frozen=True, eq=False) +class TransformConfig: + """ + A container for configuration affecting transforms. The `config` argument + to transforms is an instance of this class. 
+ """ + + # the name of the current kind + kind: str + + # the path to the kind configuration directory + path: str + + # the parsed contents of kind.yml + config: Dict + + # the parameters for this task-graph generation run + params: Parameters + + # a dict of all the tasks associated with the kind dependencies of the + # current kind + kind_dependencies_tasks: Dict[str, Task] + + # Global configuration of the taskgraph + graph_config: GraphConfig + + # whether to write out artifacts for the decision task + write_artifacts: bool + + @property + @memoize + def repo_configs(self): + repositories = self.graph_config["taskgraph"]["repositories"] + if len(repositories) == 1: + current_prefix = list(repositories.keys())[0] + else: + project = self.params["project"] + matching_repos = { + repo_prefix: repo + for (repo_prefix, repo) in repositories.items() + if re.match(repo["project-regex"], project) + } + if len(matching_repos) != 1: + raise Exception( + f"Couldn't find repository matching project `{project}`" + ) + current_prefix = list(matching_repos.keys())[0] + + repo_configs = { + current_prefix: RepoConfig( + prefix=current_prefix, + name=repositories[current_prefix]["name"], + base_repository=self.params["base_repository"], + head_repository=self.params["head_repository"], + head_ref=self.params["head_ref"], + head_rev=self.params["head_rev"], + type=self.params["repository_type"], + ssh_secret_name=repositories[current_prefix].get("ssh-secret-name"), + ), + } + if len(repositories) != 1: + repo_configs.update( + { + repo_prefix: RepoConfig( + prefix=repo_prefix, + name=repo["name"], + base_repository=repo["default-repository"], + head_repository=repo["default-repository"], + head_ref=repo["default-ref"], + type=repo["type"], + ssh_secret_name=repo.get("ssh-secret-name"), + ) + for (repo_prefix, repo) in repositories.items() + if repo_prefix != current_prefix + } + ) + return repo_configs + + +@dataclass() +class TransformSequence: + """ + Container for a sequence of transforms. Each transform is represented as a + callable taking (config, items) and returning a generator which will yield + transformed items. The resulting sequence has the same interface. + + This is convenient to use in a file full of transforms, as it provides a + decorator, @transforms.add, that will add the decorated function to the + sequence. 
+ """ + + _transforms: List = field(default_factory=list) + + def __call__(self, config, items): + for xform in self._transforms: + items = xform(config, items) + if items is None: + raise Exception(f"Transform {xform} is not a generator") + return items + + def add(self, func): + self._transforms.append(func) + return func + + def add_validate(self, schema): + self.add(ValidateSchema(schema)) + + +@dataclass +class ValidateSchema: + schema: Schema + + def __call__(self, config, tasks): + for task in tasks: + if "name" in task: + error = "In {kind} kind task {name!r}:".format( + kind=config.kind, name=task["name"] + ) + elif "label" in task: + error = "In job {label!r}:".format(label=task["label"]) + elif "primary-dependency" in task: + error = "In {kind} kind task for {dependency!r}:".format( + kind=config.kind, dependency=task["primary-dependency"].label + ) + else: + error = "In unknown task:" + validate_schema(self.schema, task, error) + yield task diff --git a/third_party/python/taskcluster_taskgraph/taskgraph/transforms/cached_tasks.py b/third_party/python/taskcluster_taskgraph/taskgraph/transforms/cached_tasks.py new file mode 100644 index 0000000000..57a55dffb3 --- /dev/null +++ b/third_party/python/taskcluster_taskgraph/taskgraph/transforms/cached_tasks.py @@ -0,0 +1,90 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. + + +from collections import deque + +import taskgraph +from taskgraph.transforms.base import TransformSequence +from taskgraph.util.cached_tasks import add_optimization + +transforms = TransformSequence() + + +def order_tasks(config, tasks): + """Iterate image tasks in an order where parent tasks come first.""" + if config.kind == "docker-image": + kind_prefix = "build-docker-image-" + else: + kind_prefix = config.kind + "-" + + pending = deque(tasks) + task_labels = {task["label"] for task in pending} + emitted = set() + while True: + try: + task = pending.popleft() + except IndexError: + break + parents = { + task + for task in task.get("dependencies", {}).values() + if task.startswith(kind_prefix) + } + if parents and not emitted.issuperset(parents & task_labels): + pending.append(task) + continue + emitted.add(task["label"]) + yield task + + +def format_task_digest(cached_task): + return "/".join( + [ + cached_task["type"], + cached_task["name"], + cached_task["digest"], + ] + ) + + +@transforms.add +def cache_task(config, tasks): + if taskgraph.fast: + for task in tasks: + yield task + return + + digests = {} + for task in config.kind_dependencies_tasks.values(): + if "cached_task" in task.attributes: + digests[task.label] = format_task_digest(task.attributes["cached_task"]) + + for task in order_tasks(config, tasks): + cache = task.pop("cache", None) + if cache is None: + yield task + continue + + dependency_digests = [] + for p in task.get("dependencies", {}).values(): + if p in digests: + dependency_digests.append(digests[p]) + else: + raise Exception( + "Cached task {} has uncached parent task: {}".format( + task["label"], p + ) + ) + digest_data = cache["digest-data"] + sorted(dependency_digests) + add_optimization( + config, + task, + cache_type=cache["type"], + cache_name=cache["name"], + digest_data=digest_data, + ) + digests[task["label"]] = format_task_digest(task["attributes"]["cached_task"]) + + yield task diff --git a/third_party/python/taskcluster_taskgraph/taskgraph/transforms/chunking.py 
b/third_party/python/taskcluster_taskgraph/taskgraph/transforms/chunking.py new file mode 100644 index 0000000000..31d7eff82c --- /dev/null +++ b/third_party/python/taskcluster_taskgraph/taskgraph/transforms/chunking.py @@ -0,0 +1,82 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. +import copy +from textwrap import dedent + +from voluptuous import ALLOW_EXTRA, Optional, Required + +from taskgraph.transforms.base import TransformSequence +from taskgraph.util.schema import Schema +from taskgraph.util.templates import substitute + +CHUNK_SCHEMA = Schema( + { + # Optional, so it can be used for a subset of tasks in a kind + Optional( + "chunk", + description=dedent( + """ + `chunk` can be used to split one task into `total-chunks` + tasks, substituting `this_chunk` and `total_chunks` into any + fields in `substitution-fields`. + """.lstrip() + ), + ): { + Required( + "total-chunks", + description=dedent( + """ + The total number of chunks to split the task into. + """.lstrip() + ), + ): int, + Optional( + "substitution-fields", + description=dedent( + """ + A list of fields that need to have `{this_chunk}` and/or + `{total_chunks}` replaced in them. + """.lstrip() + ), + ): [str], + } + }, + extra=ALLOW_EXTRA, +) + +transforms = TransformSequence() +transforms.add_validate(CHUNK_SCHEMA) + + +@transforms.add +def chunk_tasks(config, tasks): + for task in tasks: + chunk_config = task.pop("chunk", None) + if not chunk_config: + yield task + continue + + total_chunks = chunk_config["total-chunks"] + + for this_chunk in range(1, total_chunks + 1): + subtask = copy.deepcopy(task) + + subs = { + "this_chunk": this_chunk, + "total_chunks": total_chunks, + } + subtask.setdefault("attributes", {}) + subtask["attributes"].update(subs) + + for field in chunk_config["substitution-fields"]: + container, subfield = subtask, field + while "." in subfield: + f, subfield = subfield.split(".", 1) + container = container[f] + + subcontainer = copy.deepcopy(container[subfield]) + subfield = substitute(subfield, **subs) + container[subfield] = substitute(subcontainer, **subs) + + yield subtask diff --git a/third_party/python/taskcluster_taskgraph/taskgraph/transforms/code_review.py b/third_party/python/taskcluster_taskgraph/taskgraph/transforms/code_review.py new file mode 100644 index 0000000000..bdb655b97d --- /dev/null +++ b/third_party/python/taskcluster_taskgraph/taskgraph/transforms/code_review.py @@ -0,0 +1,23 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. +""" +Add soft dependencies and configuration to code-review tasks. 
+""" + + +from taskgraph.transforms.base import TransformSequence + +transforms = TransformSequence() + + +@transforms.add +def add_dependencies(config, jobs): + for job in jobs: + job.setdefault("soft-dependencies", []) + job["soft-dependencies"] += [ + dep_task.label + for dep_task in config.kind_dependencies_tasks.values() + if dep_task.attributes.get("code-review") is True + ] + yield job diff --git a/third_party/python/taskcluster_taskgraph/taskgraph/transforms/docker_image.py b/third_party/python/taskcluster_taskgraph/taskgraph/transforms/docker_image.py new file mode 100644 index 0000000000..d0c5b9c97b --- /dev/null +++ b/third_party/python/taskcluster_taskgraph/taskgraph/transforms/docker_image.py @@ -0,0 +1,214 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. + + +import json +import logging +import os +import re + +from voluptuous import Optional, Required + +import taskgraph +from taskgraph.transforms.base import TransformSequence +from taskgraph.util.docker import create_context_tar, generate_context_hash +from taskgraph.util.schema import Schema + +from .task import task_description_schema + +logger = logging.getLogger(__name__) + +CONTEXTS_DIR = "docker-contexts" + +DIGEST_RE = re.compile("^[0-9a-f]{64}$") + +IMAGE_BUILDER_IMAGE = ( + "mozillareleases/image_builder:5.0.0" + "@sha256:" + "e510a9a9b80385f71c112d61b2f2053da625aff2b6d430411ac42e424c58953f" +) + +transforms = TransformSequence() + +docker_image_schema = Schema( + { + # Name of the docker image. + Required("name"): str, + # Name of the parent docker image. + Optional("parent"): str, + # Treeherder symbol. + Optional("symbol"): str, + # relative path (from config.path) to the file the docker image was defined + # in. + Optional("task-from"): str, + # Arguments to use for the Dockerfile. + Optional("args"): {str: str}, + # Name of the docker image definition under taskcluster/docker, when + # different from the docker image name. + Optional("definition"): str, + # List of package tasks this docker image depends on. 
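# Illustration (package name hypothetical): declaring
#   "packages": ["deb12-python"]
# requires a kind dependency labelled "packages-deb12-python"; fill_template
# below turns it into a task dependency and exposes it to the Dockerfile as
# "<deb12-python>" via DOCKER_IMAGE_PACKAGES.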
+ Optional("packages"): [str], + Optional( + "index", + description="information for indexing this build so its artifacts can be discovered", + ): task_description_schema["index"], + Optional( + "cache", + description="Whether this image should be cached based on inputs.", + ): bool, + } +) + + +transforms.add_validate(docker_image_schema) + + +@transforms.add +def fill_template(config, tasks): + available_packages = set() + for task in config.kind_dependencies_tasks.values(): + if task.kind != "packages": + continue + name = task.label.replace("packages-", "") + available_packages.add(name) + + context_hashes = {} + + tasks = list(tasks) + + if not taskgraph.fast and config.write_artifacts: + if not os.path.isdir(CONTEXTS_DIR): + os.makedirs(CONTEXTS_DIR) + + for task in tasks: + image_name = task.pop("name") + job_symbol = task.pop("symbol", None) + args = task.pop("args", {}) + definition = task.pop("definition", image_name) + packages = task.pop("packages", []) + parent = task.pop("parent", None) + + for p in packages: + if p not in available_packages: + raise Exception( + "Missing package job for {}-{}: {}".format( + config.kind, image_name, p + ) + ) + + if not taskgraph.fast: + context_path = os.path.join("taskcluster", "docker", definition) + topsrcdir = os.path.dirname(config.graph_config.taskcluster_yml) + if config.write_artifacts: + context_file = os.path.join(CONTEXTS_DIR, f"{image_name}.tar.gz") + logger.info(f"Writing {context_file} for docker image {image_name}") + context_hash = create_context_tar( + topsrcdir, + context_path, + context_file, + args, + ) + else: + context_hash = generate_context_hash(topsrcdir, context_path, args) + else: + if config.write_artifacts: + raise Exception("Can't write artifacts if `taskgraph.fast` is set.") + context_hash = "0" * 40 + digest_data = [context_hash] + digest_data += [json.dumps(args, sort_keys=True)] + context_hashes[image_name] = context_hash + + description = "Build the docker image {} for use by dependent tasks".format( + image_name + ) + + args["DOCKER_IMAGE_PACKAGES"] = " ".join(f"<{p}>" for p in packages) + + # Adjust the zstandard compression level based on the execution level. + # We use faster compression for level 1 because we care more about + # end-to-end times. We use slower/better compression for other levels + # because images are read more often and it is worth the trade-off to + # burn more CPU once to reduce image size. 
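# Note that config.params["level"] is a string (the fetch transforms compare
# it with "3"), hence the int() cast below: level 1 compresses at zstd level
# 3, all other levels at zstd level 10.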
+ zstd_level = "3" if int(config.params["level"]) == 1 else "10" + + # include some information that is useful in reconstructing this task + # from JSON + taskdesc = { + "label": "build-docker-image-" + image_name, + "description": description, + "attributes": { + "image_name": image_name, + "artifact_prefix": "public", + }, + "always-target": True, + "expires-after": "28 days" if config.params.is_try() else "1 year", + "scopes": [], + "run-on-projects": [], + "worker-type": "images", + "worker": { + "implementation": "docker-worker", + "os": "linux", + "artifacts": [ + { + "type": "file", + "path": "/workspace/image.tar.zst", + "name": "public/image.tar.zst", + } + ], + "env": { + "CONTEXT_TASK_ID": {"task-reference": "<decision>"}, + "CONTEXT_PATH": "public/docker-contexts/{}.tar.gz".format( + image_name + ), + "HASH": context_hash, + "PROJECT": config.params["project"], + "IMAGE_NAME": image_name, + "DOCKER_IMAGE_ZSTD_LEVEL": zstd_level, + "DOCKER_BUILD_ARGS": { + "task-reference": json.dumps(args), + }, + "VCS_BASE_REPOSITORY": config.params["base_repository"], + "VCS_HEAD_REPOSITORY": config.params["head_repository"], + "VCS_HEAD_REV": config.params["head_rev"], + "VCS_REPOSITORY_TYPE": config.params["repository_type"], + }, + "chain-of-trust": True, + "max-run-time": 7200, + }, + } + if "index" in task: + taskdesc["index"] = task["index"] + if job_symbol: + taskdesc["treeherder"] = { + "symbol": job_symbol, + "platform": "taskcluster-images/opt", + "kind": "other", + "tier": 1, + } + + worker = taskdesc["worker"] + + worker["docker-image"] = IMAGE_BUILDER_IMAGE + digest_data.append(f"image-builder-image:{IMAGE_BUILDER_IMAGE}") + + if packages: + deps = taskdesc.setdefault("dependencies", {}) + for p in sorted(packages): + deps[p] = f"packages-{p}" + + if parent: + deps = taskdesc.setdefault("dependencies", {}) + deps["parent"] = f"build-docker-image-{parent}" + worker["env"]["PARENT_TASK_ID"] = { + "task-reference": "<parent>", + } + + if task.get("cache", True) and not taskgraph.fast: + taskdesc["cache"] = { + "type": "docker-images.v2", + "name": image_name, + "digest-data": digest_data, + } + + yield taskdesc diff --git a/third_party/python/taskcluster_taskgraph/taskgraph/transforms/fetch.py b/third_party/python/taskcluster_taskgraph/taskgraph/transforms/fetch.py new file mode 100644 index 0000000000..bcb8ff38a6 --- /dev/null +++ b/third_party/python/taskcluster_taskgraph/taskgraph/transforms/fetch.py @@ -0,0 +1,336 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. + +# Support for running tasks that download remote content and re-export +# it as task artifacts. + + +import os +import re +from dataclasses import dataclass +from typing import Callable + +from voluptuous import Extra, Optional, Required + +import taskgraph + +from ..util import path +from ..util.cached_tasks import add_optimization +from ..util.schema import Schema, validate_schema +from ..util.treeherder import join_symbol +from .base import TransformSequence + +CACHE_TYPE = "content.v1" + +FETCH_SCHEMA = Schema( + { + # Name of the task. + Required("name"): str, + # Relative path (from config.path) to the file the task was defined + # in. + Optional("task-from"): str, + # Description of the task. 
+ Required("description"): str, + Optional("docker-image"): object, + Optional( + "fetch-alias", + description="An alias that can be used instead of the real fetch job name in " + "fetch stanzas for jobs.", + ): str, + Optional( + "artifact-prefix", + description="The prefix of the taskcluster artifact being uploaded. " + "Defaults to `public/`; if it starts with something other than " + "`public/` the artifact will require scopes to access.", + ): str, + Optional("attributes"): {str: object}, + Required("fetch"): { + Required("type"): str, + Extra: object, + }, + } +) + + +# define a collection of payload builders, depending on the worker implementation +fetch_builders = {} + + +@dataclass(frozen=True) +class FetchBuilder: + schema: Schema + builder: Callable + + +def fetch_builder(name, schema): + schema = Schema({Required("type"): name}).extend(schema) + + def wrap(func): + fetch_builders[name] = FetchBuilder(schema, func) + return func + + return wrap + + +transforms = TransformSequence() +transforms.add_validate(FETCH_SCHEMA) + + +@transforms.add +def process_fetch_job(config, jobs): + # Converts fetch-url entries to the job schema. + for job in jobs: + typ = job["fetch"]["type"] + name = job["name"] + fetch = job.pop("fetch") + + if typ not in fetch_builders: + raise Exception(f"Unknown fetch type {typ} in fetch {name}") + validate_schema(fetch_builders[typ].schema, fetch, f"In task.fetch {name!r}:") + + job.update(configure_fetch(config, typ, name, fetch)) + + yield job + + +def configure_fetch(config, typ, name, fetch): + if typ not in fetch_builders: + raise Exception(f"No fetch type {typ} in fetch {name}") + validate_schema(fetch_builders[typ].schema, fetch, f"In task.fetch {name!r}:") + + return fetch_builders[typ].builder(config, name, fetch) + + +@transforms.add +def make_task(config, jobs): + # Fetch tasks are idempotent and immutable. Have them live for + # essentially forever. + if config.params["level"] == "3": + expires = "1000 years" + else: + expires = "28 days" + + for job in jobs: + name = job["name"] + artifact_prefix = job.get("artifact-prefix", "public") + env = job.get("env", {}) + env.update({"UPLOAD_DIR": "/builds/worker/artifacts"}) + attributes = job.get("attributes", {}) + attributes["fetch-artifact"] = path.join(artifact_prefix, job["artifact_name"]) + alias = job.get("fetch-alias") + if alias: + attributes["fetch-alias"] = alias + + task = { + "attributes": attributes, + "name": name, + "description": job["description"], + "expires-after": expires, + "label": "fetch-%s" % name, + "run-on-projects": [], + "run": { + "using": "run-task", + "checkout": False, + "command": job["command"], + }, + "worker-type": "images", + "worker": { + "chain-of-trust": True, + "docker-image": job.get("docker-image", {"in-tree": "fetch"}), + "env": env, + "max-run-time": 900, + "artifacts": [ + { + "type": "directory", + "name": artifact_prefix, + "path": "/builds/worker/artifacts", + } + ], + }, + } + + if "treeherder" in config.graph_config: + task["treeherder"] = { + "symbol": join_symbol("Fetch", name), + "kind": "build", + "platform": "fetch/opt", + "tier": 1, + } + + if job.get("secret", None): + task["scopes"] = ["secrets:get:" + job.get("secret")] + task["worker"]["taskcluster-proxy"] = True + + if not taskgraph.fast: + cache_name = task["label"].replace(f"{config.kind}-", "", 1) + + # This adds the level to the index path automatically. 
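# For a hypothetical task labelled "fetch-nodejs" in the "fetch" kind the
# cache name becomes "nodejs"; the digest data comes from the builder below
# (for static-url fetches, the --sha256/--size/--strip-components/--add-prefix
# arguments plus the artifact name), so changing any of those produces a new
# cached artifact instead of reusing the old one.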
+ add_optimization( + config, + task, + cache_type=CACHE_TYPE, + cache_name=cache_name, + digest_data=job["digest_data"], + ) + yield task + + +@fetch_builder( + "static-url", + schema={ + # The URL to download. + Required("url"): str, + # The SHA-256 of the downloaded content. + Required("sha256"): str, + # Size of the downloaded entity, in bytes. + Required("size"): int, + # GPG signature verification. + Optional("gpg-signature"): { + # URL where GPG signature document can be obtained. Can contain the + # value ``{url}``, which will be substituted with the value from + # ``url``. + Required("sig-url"): str, + # Path to file containing GPG public key(s) used to validate + # download. + Required("key-path"): str, + }, + # The name to give to the generated artifact. Defaults to the file + # portion of the URL. Using a different extension converts the + # archive to the given type. Only conversion to .tar.zst is + # supported. + Optional("artifact-name"): str, + # Strip the given number of path components at the beginning of + # each file entry in the archive. + # Requires an artifact-name ending with .tar.zst. + Optional("strip-components"): int, + # Add the given prefix to each file entry in the archive. + # Requires an artifact-name ending with .tar.zst. + Optional("add-prefix"): str, + # Headers to pass alongside the request. + Optional("headers"): { + str: str, + }, + # IMPORTANT: when adding anything that changes the behavior of the task, + # it is important to update the digest data used to compute cache hits. + }, +) +def create_fetch_url_task(config, name, fetch): + artifact_name = fetch.get("artifact-name") + if not artifact_name: + artifact_name = fetch["url"].split("/")[-1] + + command = [ + "fetch-content", + "static-url", + ] + + # Arguments that matter to the cache digest + args = [ + "--sha256", + fetch["sha256"], + "--size", + "%d" % fetch["size"], + ] + + if fetch.get("strip-components"): + args.extend(["--strip-components", "%d" % fetch["strip-components"]]) + + if fetch.get("add-prefix"): + args.extend(["--add-prefix", fetch["add-prefix"]]) + + command.extend(args) + + env = {} + + if "gpg-signature" in fetch: + sig_url = fetch["gpg-signature"]["sig-url"].format(url=fetch["url"]) + key_path = os.path.join(taskgraph.GECKO, fetch["gpg-signature"]["key-path"]) + + with open(key_path) as fh: + gpg_key = fh.read() + + env["FETCH_GPG_KEY"] = gpg_key + command.extend( + [ + "--gpg-sig-url", + sig_url, + "--gpg-key-env", + "FETCH_GPG_KEY", + ] + ) + + if "headers" in fetch: + for k, v in fetch["headers"].items(): + command.extend(["-H", f"{k}:{v}"]) + + command.extend( + [ + fetch["url"], + "/builds/worker/artifacts/%s" % artifact_name, + ] + ) + + return { + "command": command, + "artifact_name": artifact_name, + "env": env, + # We don't include the GPG signature in the digest because it isn't + # materially important for caching: GPG signatures are supplemental + # trust checking beyond what the shasum already provides. + "digest_data": args + [artifact_name], + } + + +@fetch_builder( + "git", + schema={ + Required("repo"): str, + Required("revision"): str, + Optional("include-dot-git"): bool, + Optional("artifact-name"): str, + Optional("path-prefix"): str, + # ssh-key is a taskcluster secret path (e.g. project/civet/github-deploy-key) + # In the secret dictionary, the key should be specified as + # "ssh_privkey": "-----BEGIN OPENSSH PRIVATE KEY-----\nkfksnb3jc..." + # n.b. The OpenSSH private key file format requires a newline at the end of the file. 
+ Optional("ssh-key"): str, + }, +) +def create_git_fetch_task(config, name, fetch): + path_prefix = fetch.get("path-prefix") + if not path_prefix: + path_prefix = fetch["repo"].rstrip("/").rsplit("/", 1)[-1] + artifact_name = fetch.get("artifact-name") + if not artifact_name: + artifact_name = f"{path_prefix}.tar.zst" + + if not re.match(r"[0-9a-fA-F]{40}", fetch["revision"]): + raise Exception(f'Revision is not a sha1 in fetch task "{name}"') + + args = [ + "fetch-content", + "git-checkout-archive", + "--path-prefix", + path_prefix, + fetch["repo"], + fetch["revision"], + "/builds/worker/artifacts/%s" % artifact_name, + ] + + ssh_key = fetch.get("ssh-key") + if ssh_key: + args.append("--ssh-key-secret") + args.append(ssh_key) + + digest_data = [fetch["revision"], path_prefix, artifact_name] + if fetch.get("include-dot-git", False): + args.append("--include-dot-git") + digest_data.append(".git") + + return { + "command": args, + "artifact_name": artifact_name, + "digest_data": digest_data, + "secret": ssh_key, + } diff --git a/third_party/python/taskcluster_taskgraph/taskgraph/transforms/from_deps.py b/third_party/python/taskcluster_taskgraph/taskgraph/transforms/from_deps.py new file mode 100644 index 0000000000..337d68e4ba --- /dev/null +++ b/third_party/python/taskcluster_taskgraph/taskgraph/transforms/from_deps.py @@ -0,0 +1,242 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. + +""" +Transforms used to create tasks based on the kind dependencies, filtering on +common attributes like the ``build-type``. + +These transforms are useful when follow-up tasks are needed for some +indeterminate subset of existing tasks. For example, running a signing task +after each build task, whatever builds may exist. +""" +from copy import deepcopy +from textwrap import dedent + +from voluptuous import Any, Extra, Optional, Required + +from taskgraph.transforms.base import TransformSequence +from taskgraph.transforms.job import fetches_schema +from taskgraph.util.attributes import attrmatch +from taskgraph.util.dependencies import GROUP_BY_MAP, get_dependencies +from taskgraph.util.schema import Schema, validate_schema + +FROM_DEPS_SCHEMA = Schema( + { + Required("from-deps"): { + Optional( + "kinds", + description=dedent( + """ + Limit dependencies to specified kinds (defaults to all kinds in + `kind-dependencies`). + + The first kind in the list is the "primary" kind. The + dependency of this kind will be used to derive the label + and copy attributes (if `copy-attributes` is True). + """.lstrip() + ), + ): list, + Optional( + "set-name", + description=dedent( + """ + When True, `from_deps` will derive a name for the generated + tasks from the name of the primary dependency. Defaults to + True. + """.lstrip() + ), + ): bool, + Optional( + "with-attributes", + description=dedent( + """ + Limit dependencies to tasks whose attributes match + using :func:`~taskgraph.util.attributes.attrmatch`. + """.lstrip() + ), + ): {str: Any(list, str)}, + Optional( + "group-by", + description=dedent( + """ + Group cross-kind dependencies using the given group-by + function. One task will be created for each group. If not + specified, the 'single' function will be used which creates + a new task for each individual dependency. 
+ """.lstrip() + ), + ): Any( + None, + *GROUP_BY_MAP, + {Any(*GROUP_BY_MAP): object}, + ), + Optional( + "copy-attributes", + description=dedent( + """ + If True, copy attributes from the dependency matching the + first kind in the `kinds` list (whether specified explicitly + or taken from `kind-dependencies`). + """.lstrip() + ), + ): bool, + Optional( + "unique-kinds", + description=dedent( + """ + If true (the default), there must be only a single unique task + for each kind in a dependency group. Setting this to false + disables that requirement. + """.lstrip() + ), + ): bool, + Optional( + "fetches", + description=dedent( + """ + If present, a `fetches` entry will be added for each task + dependency. Attributes of the upstream task may be used as + substitution values in the `artifact` or `dest` values of the + `fetches` entry. + """.lstrip() + ), + ): {str: [fetches_schema]}, + }, + Extra: object, + }, +) +"""Schema for from_deps transforms.""" + +transforms = TransformSequence() +transforms.add_validate(FROM_DEPS_SCHEMA) + + +@transforms.add +def from_deps(config, tasks): + for task in tasks: + # Setup and error handling. + from_deps = task.pop("from-deps") + kind_deps = config.config.get("kind-dependencies", []) + kinds = from_deps.get("kinds", kind_deps) + + invalid = set(kinds) - set(kind_deps) + if invalid: + invalid = "\n".join(sorted(invalid)) + raise Exception( + dedent( + f""" + The `from-deps.kinds` key contains the following kinds + that are not defined in `kind-dependencies`: + {invalid} + """.lstrip() + ) + ) + + if not kinds: + raise Exception( + dedent( + """ + The `from_deps` transforms require at least one kind defined + in `kind-dependencies`! + """.lstrip() + ) + ) + + # Resolve desired dependencies. + with_attributes = from_deps.get("with-attributes") + deps = [ + task + for task in config.kind_dependencies_tasks.values() + if task.kind in kinds + if not with_attributes or attrmatch(task.attributes, **with_attributes) + ] + + # Resolve groups. + group_by = from_deps.get("group-by", "single") + groups = set() + + if isinstance(group_by, dict): + assert len(group_by) == 1 + group_by, arg = group_by.popitem() + func = GROUP_BY_MAP[group_by] + if func.schema: + validate_schema( + func.schema, arg, f"Invalid group-by {group_by} argument" + ) + groups = func(config, deps, arg) + else: + func = GROUP_BY_MAP[group_by] + groups = func(config, deps) + + # Split the task, one per group. + set_name = from_deps.get("set-name", True) + copy_attributes = from_deps.get("copy-attributes", False) + unique_kinds = from_deps.get("unique-kinds", True) + fetches = from_deps.get("fetches", []) + for group in groups: + # Verify there is only one task per kind in each group. + group_kinds = {t.kind for t in group} + if unique_kinds and len(group_kinds) < len(group): + raise Exception( + "The from_deps transforms only allow a single task per kind in a group!" + ) + + new_task = deepcopy(task) + new_task.setdefault("dependencies", {}) + new_task["dependencies"].update( + {dep.kind if unique_kinds else dep.label: dep.label for dep in group} + ) + + # Set name and copy attributes from the primary kind. 
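# Illustration: with kinds == ["build", "signing"] (the signing-after-build
# case from the module docstring) and a group whose build dependency is
# labelled "build-linux64/opt", the primary kind is "build" and, when
# set-name is true, the generated task is named "linux64/opt" (the
# "build-" prefix is stripped in the branch below).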
+ for kind in kinds: + if kind in group_kinds: + primary_kind = kind + break + else: + raise Exception("Could not detect primary kind!") + + new_task.setdefault("attributes", {})[ + "primary-kind-dependency" + ] = primary_kind + + primary_dep = [dep for dep in group if dep.kind == primary_kind][0] + + if set_name: + if primary_dep.label.startswith(primary_kind): + new_task["name"] = primary_dep.label[len(primary_kind) + 1 :] + else: + new_task["name"] = primary_dep.label + + if copy_attributes: + attrs = new_task.setdefault("attributes", {}) + new_task["attributes"] = primary_dep.attributes.copy() + new_task["attributes"].update(attrs) + + if fetches: + task_fetches = new_task.setdefault("fetches", {}) + + for dep_task in get_dependencies(config, new_task): + # Nothing to do if this kind has no fetches listed + if dep_task.kind not in fetches: + continue + + fetches_from_dep = [] + for kind, kind_fetches in fetches.items(): + if kind != dep_task.kind: + continue + + for fetch in kind_fetches: + entry = fetch.copy() + entry["artifact"] = entry["artifact"].format( + **dep_task.attributes + ) + if "dest" in entry: + entry["dest"] = entry["dest"].format( + **dep_task.attributes + ) + fetches_from_dep.append(entry) + + task_fetches[dep_task.label] = fetches_from_dep + + yield new_task diff --git a/third_party/python/taskcluster_taskgraph/taskgraph/transforms/job/__init__.py b/third_party/python/taskcluster_taskgraph/taskgraph/transforms/job/__init__.py new file mode 100644 index 0000000000..06978ff46d --- /dev/null +++ b/third_party/python/taskcluster_taskgraph/taskgraph/transforms/job/__init__.py @@ -0,0 +1,453 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. +""" +Convert a job description into a task description. + +Jobs descriptions are similar to task descriptions, but they specify how to run +the job at a higher level, using a "run" field that can be interpreted by +run-using handlers in `taskcluster/taskgraph/transforms/job`. +""" + + +import copy +import json +import logging + +from voluptuous import Any, Exclusive, Extra, Optional, Required + +from taskgraph.transforms.base import TransformSequence +from taskgraph.transforms.cached_tasks import order_tasks +from taskgraph.transforms.task import task_description_schema +from taskgraph.util import path as mozpath +from taskgraph.util.python_path import import_sibling_modules +from taskgraph.util.schema import Schema, validate_schema +from taskgraph.util.taskcluster import get_artifact_prefix +from taskgraph.util.workertypes import worker_type_implementation + +logger = logging.getLogger(__name__) + +# Fetches may be accepted in other transforms and eventually passed along +# to a `job` (eg: from_deps). Defining this here allows them to re-use +# the schema and avoid duplication. +fetches_schema = { + Required("artifact"): str, + Optional("dest"): str, + Optional("extract"): bool, + Optional("verify-hash"): bool, +} + +# Schema for a build description +job_description_schema = Schema( + { + # The name of the job and the job's label. At least one must be specified, + # and the label will be generated from the name if necessary, by prepending + # the kind. + Optional("name"): str, + Optional("label"): str, + # the following fields are passed directly through to the task description, + # possibly modified by the run implementation. 
See + # taskcluster/taskgraph/transforms/task.py for the schema details. + Required("description"): task_description_schema["description"], + Optional("attributes"): task_description_schema["attributes"], + Optional("task-from"): task_description_schema["task-from"], + Optional("dependencies"): task_description_schema["dependencies"], + Optional("soft-dependencies"): task_description_schema["soft-dependencies"], + Optional("if-dependencies"): task_description_schema["if-dependencies"], + Optional("requires"): task_description_schema["requires"], + Optional("expires-after"): task_description_schema["expires-after"], + Optional("routes"): task_description_schema["routes"], + Optional("scopes"): task_description_schema["scopes"], + Optional("tags"): task_description_schema["tags"], + Optional("extra"): task_description_schema["extra"], + Optional("treeherder"): task_description_schema["treeherder"], + Optional("index"): task_description_schema["index"], + Optional("run-on-projects"): task_description_schema["run-on-projects"], + Optional("run-on-tasks-for"): task_description_schema["run-on-tasks-for"], + Optional("run-on-git-branches"): task_description_schema["run-on-git-branches"], + Optional("shipping-phase"): task_description_schema["shipping-phase"], + Optional("always-target"): task_description_schema["always-target"], + Exclusive("optimization", "optimization"): task_description_schema[ + "optimization" + ], + Optional("needs-sccache"): task_description_schema["needs-sccache"], + # The "when" section contains descriptions of the circumstances under which + # this task should be included in the task graph. This will be converted + # into an optimization, so it cannot be specified in a job description that + # also gives 'optimization'. + Exclusive("when", "optimization"): { + # This task only needs to be run if a file matching one of the given + # patterns has changed in the push. The patterns use the mozpack + # match function (python/mozbuild/mozpack/path.py). + Optional("files-changed"): [str], + }, + # A list of artifacts to install from 'fetch' tasks. + Optional("fetches"): { + Any("toolchain", "fetch"): [str], + str: [ + str, + fetches_schema, + ], + }, + # A description of how to run this job. + "run": { + # The key to a job implementation in a peer module to this one + "using": str, + # Base work directory used to set up the task. + Optional("workdir"): str, + # Any remaining content is verified against that job implementation's + # own schema. + Extra: object, + }, + Required("worker-type"): task_description_schema["worker-type"], + # This object will be passed through to the task description, with additions + # provided by the job's run-using function + Optional("worker"): dict, + } +) + +transforms = TransformSequence() +transforms.add_validate(job_description_schema) + + +@transforms.add +def rewrite_when_to_optimization(config, jobs): + for job in jobs: + when = job.pop("when", {}) + if not when: + yield job + continue + + files_changed = when.get("files-changed") + + # implicitly add task config directory. 
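+ # For example, a kind defined under taskcluster/ci/<kind> (illustrative
+ # layout) gets "taskcluster/ci/<kind>/**" appended here, so changes to the
+ # kind definition itself also re-schedule the task.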
+ files_changed.append(f"{config.path}/**") + + # "only when files changed" implies "skip if files have not changed" + job["optimization"] = {"skip-unless-changed": files_changed} + + assert "when" not in job + yield job + + +@transforms.add +def set_implementation(config, jobs): + for job in jobs: + impl, os = worker_type_implementation(config.graph_config, job["worker-type"]) + if os: + job.setdefault("tags", {})["os"] = os + if impl: + job.setdefault("tags", {})["worker-implementation"] = impl + worker = job.setdefault("worker", {}) + assert "implementation" not in worker + worker["implementation"] = impl + if os: + worker["os"] = os + yield job + + +@transforms.add +def set_label(config, jobs): + for job in jobs: + if "label" not in job: + if "name" not in job: + raise Exception("job has neither a name nor a label") + job["label"] = "{}-{}".format(config.kind, job["name"]) + if job.get("name"): + del job["name"] + yield job + + +@transforms.add +def add_resource_monitor(config, jobs): + for job in jobs: + if job.get("attributes", {}).get("resource-monitor"): + worker_implementation, worker_os = worker_type_implementation( + config.graph_config, job["worker-type"] + ) + # Normalise worker os so that linux-bitbar and similar use linux tools. + worker_os = worker_os.split("-")[0] + if "win7" in job["worker-type"]: + arch = "32" + else: + arch = "64" + job.setdefault("fetches", {}) + job["fetches"].setdefault("toolchain", []) + job["fetches"]["toolchain"].append(f"{worker_os}{arch}-resource-monitor") + + if worker_implementation == "docker-worker": + artifact_source = "/builds/worker/monitoring/resource-monitor.json" + else: + artifact_source = "monitoring/resource-monitor.json" + job["worker"].setdefault("artifacts", []) + job["worker"]["artifacts"].append( + { + "name": "public/monitoring/resource-monitor.json", + "type": "file", + "path": artifact_source, + } + ) + # Set env for output file + job["worker"].setdefault("env", {}) + job["worker"]["env"]["RESOURCE_MONITOR_OUTPUT"] = artifact_source + + yield job + + +def get_attribute(dict, key, attributes, attribute_name): + """Get `attribute_name` from the given `attributes` dict, and if there + is a corresponding value, set `key` in `dict` to that value.""" + value = attributes.get(attribute_name) + if value: + dict[key] = value + + +@transforms.add +def use_fetches(config, jobs): + artifact_names = {} + aliases = {} + extra_env = {} + + if config.kind in ("toolchain", "fetch"): + jobs = list(jobs) + for job in jobs: + run = job.get("run", {}) + label = job["label"] + get_attribute(artifact_names, label, run, "toolchain-artifact") + value = run.get(f"{config.kind}-alias") + if value: + aliases[f"{config.kind}-{value}"] = label + + for task in config.kind_dependencies_tasks.values(): + if task.kind in ("fetch", "toolchain"): + get_attribute( + artifact_names, + task.label, + task.attributes, + f"{task.kind}-artifact", + ) + get_attribute(extra_env, task.label, task.attributes, f"{task.kind}-env") + value = task.attributes.get(f"{task.kind}-alias") + if value: + aliases[f"{task.kind}-{value}"] = task.label + + artifact_prefixes = {} + for job in order_tasks(config, jobs): + artifact_prefixes[job["label"]] = get_artifact_prefix(job) + + fetches = job.pop("fetches", None) + if not fetches: + yield job + continue + + job_fetches = [] + name = job.get("name", job.get("label")) + dependencies = job.setdefault("dependencies", {}) + worker = job.setdefault("worker", {}) + env = worker.setdefault("env", {}) + prefix = get_artifact_prefix(job) + 
for kind in sorted(fetches): + artifacts = fetches[kind] + if kind in ("fetch", "toolchain"): + for fetch_name in sorted(artifacts): + label = f"{kind}-{fetch_name}" + label = aliases.get(label, label) + if label not in artifact_names: + raise Exception( + "Missing fetch job for {kind}-{name}: {fetch}".format( + kind=config.kind, name=name, fetch=fetch_name + ) + ) + if label in extra_env: + env.update(extra_env[label]) + + path = artifact_names[label] + + dependencies[label] = label + job_fetches.append( + { + "artifact": path, + "task": f"<{label}>", + "extract": True, + } + ) + else: + if kind not in dependencies: + raise Exception( + "{name} can't fetch {kind} artifacts because " + "it has no {kind} dependencies!".format(name=name, kind=kind) + ) + dep_label = dependencies[kind] + if dep_label in artifact_prefixes: + prefix = artifact_prefixes[dep_label] + else: + dep_tasks = [ + task + for label, task in config.kind_dependencies_tasks.items() + if label == dep_label + ] + if len(dep_tasks) != 1: + raise Exception( + "{name} can't fetch {kind} artifacts because " + "there are {tasks} with label {label} in kind dependencies!".format( + name=name, + kind=kind, + label=dependencies[kind], + tasks="no tasks" + if len(dep_tasks) == 0 + else "multiple tasks", + ) + ) + + prefix = get_artifact_prefix(dep_tasks[0]) + + def cmp_artifacts(a): + if isinstance(a, str): + return a + else: + return a["artifact"] + + for artifact in sorted(artifacts, key=cmp_artifacts): + if isinstance(artifact, str): + path = artifact + dest = None + extract = True + verify_hash = False + else: + path = artifact["artifact"] + dest = artifact.get("dest") + extract = artifact.get("extract", True) + verify_hash = artifact.get("verify-hash", False) + + fetch = { + "artifact": f"{prefix}/{path}", + "task": f"<{kind}>", + "extract": extract, + } + if dest is not None: + fetch["dest"] = dest + if verify_hash: + fetch["verify-hash"] = verify_hash + job_fetches.append(fetch) + + job_artifact_prefixes = { + mozpath.dirname(fetch["artifact"]) + for fetch in job_fetches + if not fetch["artifact"].startswith("public/") + } + if job_artifact_prefixes: + # Use taskcluster-proxy and request appropriate scope. For example, add + # 'scopes: [queue:get-artifact:path/to/*]' for 'path/to/artifact.tar.xz'. 
+ worker["taskcluster-proxy"] = True + for prefix in sorted(job_artifact_prefixes): + scope = f"queue:get-artifact:{prefix}/*" + if scope not in job.setdefault("scopes", []): + job["scopes"].append(scope) + + env["MOZ_FETCHES"] = {"task-reference": json.dumps(job_fetches, sort_keys=True)} + + env.setdefault("MOZ_FETCHES_DIR", "fetches") + + yield job + + +@transforms.add +def make_task_description(config, jobs): + """Given a build description, create a task description""" + # import plugin modules first, before iterating over jobs + import_sibling_modules(exceptions=("common.py",)) + + for job in jobs: + # always-optimized tasks never execute, so have no workdir + if job["worker"]["implementation"] in ("docker-worker", "generic-worker"): + job["run"].setdefault("workdir", "/builds/worker") + + taskdesc = copy.deepcopy(job) + + # fill in some empty defaults to make run implementations easier + taskdesc.setdefault("attributes", {}) + taskdesc.setdefault("dependencies", {}) + taskdesc.setdefault("soft-dependencies", []) + taskdesc.setdefault("routes", []) + taskdesc.setdefault("scopes", []) + taskdesc.setdefault("extra", {}) + + # give the function for job.run.using on this worker implementation a + # chance to set up the task description. + configure_taskdesc_for_run( + config, job, taskdesc, job["worker"]["implementation"] + ) + del taskdesc["run"] + + # yield only the task description, discarding the job description + yield taskdesc + + +# A registry of all functions decorated with run_job_using +registry = {} + + +def run_job_using(worker_implementation, run_using, schema=None, defaults={}): + """Register the decorated function as able to set up a task description for + jobs with the given worker implementation and `run.using` property. If + `schema` is given, the job's run field will be verified to match it. + + The decorated function should have the signature `using_foo(config, job, taskdesc)` + and should modify the task description in-place. The skeleton of + the task description is already set up, but without a payload.""" + + def wrap(func): + for_run_using = registry.setdefault(run_using, {}) + if worker_implementation in for_run_using: + raise Exception( + "run_job_using({!r}, {!r}) already exists: {!r}".format( + run_using, + worker_implementation, + for_run_using[worker_implementation], + ) + ) + for_run_using[worker_implementation] = (func, schema, defaults) + return func + + return wrap + + +@run_job_using( + "always-optimized", "always-optimized", Schema({"using": "always-optimized"}) +) +def always_optimized(config, job, taskdesc): + pass + + +def configure_taskdesc_for_run(config, job, taskdesc, worker_implementation): + """ + Run the appropriate function for this job against the given task + description. + + This will raise an appropriate error if no function exists, or if the job's + run is not valid according to the schema. 
+ """ + run_using = job["run"]["using"] + if run_using not in registry: + raise Exception(f"no functions for run.using {run_using!r}") + + if worker_implementation not in registry[run_using]: + raise Exception( + "no functions for run.using {!r} on {!r}".format( + run_using, worker_implementation + ) + ) + + func, schema, defaults = registry[run_using][worker_implementation] + for k, v in defaults.items(): + job["run"].setdefault(k, v) + + if schema: + validate_schema( + schema, + job["run"], + "In job.run using {!r}/{!r} for job {!r}:".format( + job["run"]["using"], worker_implementation, job["label"] + ), + ) + func(config, job, taskdesc) diff --git a/third_party/python/taskcluster_taskgraph/taskgraph/transforms/job/common.py b/third_party/python/taskcluster_taskgraph/taskgraph/transforms/job/common.py new file mode 100644 index 0000000000..04708daf81 --- /dev/null +++ b/third_party/python/taskcluster_taskgraph/taskgraph/transforms/job/common.py @@ -0,0 +1,171 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. +""" +Common support for various job types. These functions are all named after the +worker implementation they operate on, and take the same three parameters, for +consistency. +""" + + +import hashlib +import json + +from taskgraph.util.taskcluster import get_artifact_prefix + + +def get_vcsdir_name(os): + if os == "windows": + return "src" + else: + return "vcs" + + +def add_cache(job, taskdesc, name, mount_point, skip_untrusted=False): + """Adds a cache based on the worker's implementation. + + Args: + job (dict): Task's job description. + taskdesc (dict): Target task description to modify. + name (str): Name of the cache. + mount_point (path): Path on the host to mount the cache. + skip_untrusted (bool): Whether cache is used in untrusted environments + (default: False). Only applies to docker-worker. + """ + if not job["run"].get("use-caches", True): + return + + worker = job["worker"] + + if worker["implementation"] == "docker-worker": + taskdesc["worker"].setdefault("caches", []).append( + { + "type": "persistent", + "name": name, + "mount-point": mount_point, + "skip-untrusted": skip_untrusted, + } + ) + + elif worker["implementation"] == "generic-worker": + taskdesc["worker"].setdefault("mounts", []).append( + { + "cache-name": name, + "directory": mount_point, + } + ) + + else: + # Caches not implemented + pass + + +def add_artifacts(config, job, taskdesc, path): + taskdesc["worker"].setdefault("artifacts", []).append( + { + "name": get_artifact_prefix(taskdesc), + "path": path, + "type": "directory", + } + ) + + +def docker_worker_add_artifacts(config, job, taskdesc): + """Adds an artifact directory to the task""" + path = "{workdir}/artifacts/".format(**job["run"]) + taskdesc["worker"]["env"]["UPLOAD_DIR"] = path + add_artifacts(config, job, taskdesc, path) + + +def generic_worker_add_artifacts(config, job, taskdesc): + """Adds an artifact directory to the task""" + # The path is the location on disk; it doesn't necessarily + # mean the artifacts will be public or private; that is set via the name + # attribute in add_artifacts. + add_artifacts(config, job, taskdesc, path=get_artifact_prefix(taskdesc)) + + +def support_vcs_checkout(config, job, taskdesc, repo_configs, sparse=False): + """Update a job/task with parameters to enable a VCS checkout. 
+ + This can only be used with ``run-task`` tasks, as the cache name is + reserved for ``run-task`` tasks. + """ + worker = job["worker"] + is_mac = worker["os"] == "macosx" + is_win = worker["os"] == "windows" + is_linux = worker["os"] == "linux" + is_docker = worker["implementation"] == "docker-worker" + assert is_mac or is_win or is_linux + + if is_win: + checkoutdir = "./build" + hgstore = "y:/hg-shared" + elif is_docker: + checkoutdir = "{workdir}/checkouts".format(**job["run"]) + hgstore = f"{checkoutdir}/hg-store" + else: + checkoutdir = "./checkouts" + hgstore = f"{checkoutdir}/hg-shared" + + vcsdir = checkoutdir + "/" + get_vcsdir_name(worker["os"]) + cache_name = "checkouts" + + # Robust checkout does not clean up subrepositories, so ensure that tasks + # that checkout different sets of paths have separate caches. + # See https://bugzilla.mozilla.org/show_bug.cgi?id=1631610 + if len(repo_configs) > 1: + checkout_paths = { + "\t".join([repo_config.path, repo_config.prefix]) + for repo_config in sorted( + repo_configs.values(), key=lambda repo_config: repo_config.path + ) + } + checkout_paths_str = "\n".join(checkout_paths).encode("utf-8") + digest = hashlib.sha256(checkout_paths_str).hexdigest() + cache_name += f"-repos-{digest}" + + # Sparse checkouts need their own cache because they can interfere + # with clients that aren't sparse aware. + if sparse: + cache_name += "-sparse" + + # Workers using Mercurial >= 5.8 will enable revlog-compression-zstd, which + # workers using older versions can't understand, so they can't share cache. + # At the moment, only docker workers use the newer version. + if is_docker: + cache_name += "-hg58" + + add_cache(job, taskdesc, cache_name, checkoutdir) + + env = taskdesc["worker"].setdefault("env", {}) + env.update( + { + "HG_STORE_PATH": hgstore, + "REPOSITORIES": json.dumps( + {repo.prefix: repo.name for repo in repo_configs.values()} + ), + "VCS_PATH": vcsdir, + } + ) + for repo_config in repo_configs.values(): + env.update( + { + f"{repo_config.prefix.upper()}_{key}": value + for key, value in { + "BASE_REPOSITORY": repo_config.base_repository, + "HEAD_REPOSITORY": repo_config.head_repository, + "HEAD_REV": repo_config.head_rev, + "HEAD_REF": repo_config.head_ref, + "REPOSITORY_TYPE": repo_config.type, + "SSH_SECRET_NAME": repo_config.ssh_secret_name, + }.items() + if value is not None + } + ) + if repo_config.ssh_secret_name: + taskdesc["scopes"].append(f"secrets:get:{repo_config.ssh_secret_name}") + + # only some worker platforms have taskcluster-proxy enabled + if job["worker"]["implementation"] in ("docker-worker",): + taskdesc["worker"]["taskcluster-proxy"] = True diff --git a/third_party/python/taskcluster_taskgraph/taskgraph/transforms/job/index_search.py b/third_party/python/taskcluster_taskgraph/taskgraph/transforms/job/index_search.py new file mode 100644 index 0000000000..09b48fe594 --- /dev/null +++ b/third_party/python/taskcluster_taskgraph/taskgraph/transforms/job/index_search.py @@ -0,0 +1,37 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. + +""" +This transform allows including indexed tasks from other projects in the +current taskgraph. The transform takes a list of indexes, and the optimization +phase will replace the task with the task from the other graph. 
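+
+ An illustrative `run` stanza (the index string is a placeholder; each entry
+ is interpolated with the graph parameters, e.g. `{project}`):
+
+ {
+ "using": "index-search",
+ "index-search": [
+ "some.index.namespace.{project}.latest.target-task",
+ ],
+ }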
+""" + + +from voluptuous import Required + +from taskgraph.transforms.base import TransformSequence +from taskgraph.transforms.job import run_job_using +from taskgraph.util.schema import Schema + +transforms = TransformSequence() + +run_task_schema = Schema( + { + Required("using"): "index-search", + Required( + "index-search", + "A list of indexes in decreasing order of priority at which to lookup for this " + "task. This is interpolated with the graph parameters.", + ): [str], + } +) + + +@run_job_using("always-optimized", "index-search", schema=run_task_schema) +def fill_template(config, job, taskdesc): + run = job["run"] + taskdesc["optimization"] = { + "index-search": [index.format(**config.params) for index in run["index-search"]] + } diff --git a/third_party/python/taskcluster_taskgraph/taskgraph/transforms/job/run_task.py b/third_party/python/taskcluster_taskgraph/taskgraph/transforms/job/run_task.py new file mode 100644 index 0000000000..6337673611 --- /dev/null +++ b/third_party/python/taskcluster_taskgraph/taskgraph/transforms/job/run_task.py @@ -0,0 +1,231 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. +""" +Support for running jobs that are invoked via the `run-task` script. +""" + +import dataclasses +import os + +from voluptuous import Any, Optional, Required + +from taskgraph.transforms.job import run_job_using +from taskgraph.transforms.job.common import support_vcs_checkout +from taskgraph.transforms.task import taskref_or_string +from taskgraph.util import path, taskcluster +from taskgraph.util.schema import Schema + +EXEC_COMMANDS = { + "bash": ["bash", "-cx"], + "powershell": ["powershell.exe", "-ExecutionPolicy", "Bypass"], +} + +run_task_schema = Schema( + { + Required("using"): "run-task", + # if true, add a cache at ~worker/.cache, which is where things like pip + # tend to hide their caches. This cache is never added for level-1 jobs. + # TODO Once bug 1526028 is fixed, this and 'use-caches' should be merged. + Required("cache-dotcache"): bool, + # Whether or not to use caches. + Optional("use-caches"): bool, + # if true (the default), perform a checkout on the worker + Required("checkout"): Any(bool, {str: dict}), + Optional( + "cwd", + description="Path to run command in. If a checkout is present, the path " + "to the checkout will be interpolated with the key `checkout`", + ): str, + # The sparse checkout profile to use. Value is the filename relative to the + # directory where sparse profiles are defined (build/sparse-profiles/). + Required("sparse-profile"): Any(str, None), + # The command arguments to pass to the `run-task` script, after the + # checkout arguments. If a list, it will be passed directly; otherwise + # it will be included in a single argument to the command specified by + # `exec-with`. + Required("command"): Any([taskref_or_string], taskref_or_string), + # What to execute the command with in the event command is a string. + Optional("exec-with"): Any(*list(EXEC_COMMANDS)), + # Command used to invoke the `run-task` script. Can be used if the script + # or Python installation is in a non-standard location on the workers. + Optional("run-task-command"): list, + # Base work directory used to set up the task. + Required("workdir"): str, + # Whether to run as root. 
(defaults to False) + Optional("run-as-root"): bool, + } +) + + +def common_setup(config, job, taskdesc, command): + run = job["run"] + if run["checkout"]: + repo_configs = config.repo_configs + if len(repo_configs) > 1 and run["checkout"] is True: + raise Exception("Must explicitly specify checkouts with multiple repos.") + elif run["checkout"] is not True: + repo_configs = { + repo: dataclasses.replace(repo_configs[repo], **config) + for (repo, config) in run["checkout"].items() + } + + support_vcs_checkout( + config, + job, + taskdesc, + repo_configs=repo_configs, + sparse=bool(run["sparse-profile"]), + ) + + vcs_path = taskdesc["worker"]["env"]["VCS_PATH"] + for repo_config in repo_configs.values(): + checkout_path = path.join(vcs_path, repo_config.path) + command.append(f"--{repo_config.prefix}-checkout={checkout_path}") + + if run["sparse-profile"]: + command.append( + "--{}-sparse-profile=build/sparse-profiles/{}".format( + repo_config.prefix, + run["sparse-profile"], + ) + ) + + if "cwd" in run: + run["cwd"] = path.normpath(run["cwd"].format(checkout=vcs_path)) + elif "cwd" in run and "{checkout}" in run["cwd"]: + raise Exception( + "Found `{{checkout}}` interpolation in `cwd` for task {name} " + "but the task doesn't have a checkout: {cwd}".format( + cwd=run["cwd"], name=job.get("name", job.get("label")) + ) + ) + + if "cwd" in run: + command.extend(("--task-cwd", run["cwd"])) + + taskdesc["worker"].setdefault("env", {})["MOZ_SCM_LEVEL"] = config.params["level"] + + +worker_defaults = { + "cache-dotcache": False, + "checkout": True, + "sparse-profile": None, + "run-as-root": False, +} + + +def script_url(config, script): + if "MOZ_AUTOMATION" in os.environ and "TASK_ID" not in os.environ: + raise Exception("TASK_ID must be defined to use run-task on generic-worker") + task_id = os.environ.get("TASK_ID", "<TASK_ID>") + # use_proxy = False to avoid having all generic-workers turn on proxy + # Assumes the cluster allows anonymous downloads of public artifacts + tc_url = taskcluster.get_root_url(False) + # TODO: Use util/taskcluster.py:get_artifact_url once hack for Bug 1405889 is removed + return f"{tc_url}/api/queue/v1/task/{task_id}/artifacts/public/{script}" + + +@run_job_using( + "docker-worker", "run-task", schema=run_task_schema, defaults=worker_defaults +) +def docker_worker_run_task(config, job, taskdesc): + run = job["run"] + worker = taskdesc["worker"] = job["worker"] + command = run.pop("run-task-command", ["/usr/local/bin/run-task"]) + common_setup(config, job, taskdesc, command) + + if run.get("cache-dotcache"): + worker["caches"].append( + { + "type": "persistent", + "name": "{project}-dotcache".format(**config.params), + "mount-point": "{workdir}/.cache".format(**run), + "skip-untrusted": True, + } + ) + + run_command = run["command"] + + # dict is for the case of `{'task-reference': str}`. 
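+ # A string (or task-reference dict) command is wrapped in a shell, so e.g.
+ # "make check" (illustrative) becomes ["bash", "-cx", "make check"]; a list
+ # command is passed to run-task unchanged.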
+ if isinstance(run_command, str) or isinstance(run_command, dict): + exec_cmd = EXEC_COMMANDS[run.pop("exec-with", "bash")] + run_command = exec_cmd + [run_command] + if run["run-as-root"]: + command.extend(("--user", "root", "--group", "root")) + command.append("--") + command.extend(run_command) + worker["command"] = command + + +@run_job_using( + "generic-worker", "run-task", schema=run_task_schema, defaults=worker_defaults +) +def generic_worker_run_task(config, job, taskdesc): + run = job["run"] + worker = taskdesc["worker"] = job["worker"] + is_win = worker["os"] == "windows" + is_mac = worker["os"] == "macosx" + is_bitbar = worker["os"] == "linux-bitbar" + + command = run.pop("run-task-command", None) + if not command: + if is_win: + command = ["C:/mozilla-build/python3/python3.exe", "run-task"] + elif is_mac: + command = ["/tools/python36/bin/python3", "run-task"] + else: + command = ["./run-task"] + + common_setup(config, job, taskdesc, command) + + worker.setdefault("mounts", []) + if run.get("cache-dotcache"): + worker["mounts"].append( + { + "cache-name": "{project}-dotcache".format(**config.params), + "directory": "{workdir}/.cache".format(**run), + } + ) + worker["mounts"].append( + { + "content": { + "url": script_url(config, "run-task"), + }, + "file": "./run-task", + } + ) + if worker.get("env", {}).get("MOZ_FETCHES"): + worker["mounts"].append( + { + "content": { + "url": script_url(config, "fetch-content"), + }, + "file": "./fetch-content", + } + ) + + run_command = run["command"] + + if isinstance(run_command, str): + if is_win: + run_command = f'"{run_command}"' + exec_cmd = EXEC_COMMANDS[run.pop("exec-with", "bash")] + run_command = exec_cmd + [run_command] + + if run["run-as-root"]: + command.extend(("--user", "root", "--group", "root")) + command.append("--") + if is_bitbar: + # Use the bitbar wrapper script which sets up the device and adb + # environment variables + command.append("/builds/taskcluster/script.py") + command.extend(run_command) + + if is_win: + worker["command"] = [" ".join(command)] + else: + worker["command"] = [ + ["chmod", "+x", "run-task"], + command, + ] diff --git a/third_party/python/taskcluster_taskgraph/taskgraph/transforms/job/toolchain.py b/third_party/python/taskcluster_taskgraph/taskgraph/transforms/job/toolchain.py new file mode 100644 index 0000000000..c9c09542ff --- /dev/null +++ b/third_party/python/taskcluster_taskgraph/taskgraph/transforms/job/toolchain.py @@ -0,0 +1,175 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. +""" +Support for running toolchain-building jobs via dedicated scripts +""" + +from voluptuous import ALLOW_EXTRA, Any, Optional, Required + +import taskgraph +from taskgraph.transforms.job import configure_taskdesc_for_run, run_job_using +from taskgraph.transforms.job.common import ( + docker_worker_add_artifacts, + generic_worker_add_artifacts, + get_vcsdir_name, +) +from taskgraph.util.hash import hash_paths +from taskgraph.util.schema import Schema +from taskgraph.util.shell import quote as shell_quote + +CACHE_TYPE = "toolchains.v3" + +toolchain_run_schema = Schema( + { + Required("using"): "toolchain-script", + # The script (in taskcluster/scripts/misc) to run. + Required("script"): str, + # Arguments to pass to the script. + Optional("arguments"): [str], + # Sparse profile to give to checkout using `run-task`. 
If given, + # a filename in `build/sparse-profiles`. Defaults to + # "toolchain-build", i.e., to + # `build/sparse-profiles/toolchain-build`. If `None`, instructs + # `run-task` to not use a sparse profile at all. + Required("sparse-profile"): Any(str, None), + # Paths/patterns pointing to files that influence the outcome of a + # toolchain build. + Optional("resources"): [str], + # Path to the artifact produced by the toolchain job + Required("toolchain-artifact"): str, + Optional( + "toolchain-alias", + description="An alias that can be used instead of the real toolchain job name in " + "fetch stanzas for jobs.", + ): Any(str, [str]), + Optional( + "toolchain-env", + description="Additional env variables to add to the worker when using this toolchain", + ): {str: object}, + # Base work directory used to set up the task. + Required("workdir"): str, + }, + extra=ALLOW_EXTRA, +) + + +def get_digest_data(config, run, taskdesc): + files = list(run.pop("resources", [])) + # The script + files.append("taskcluster/scripts/toolchain/{}".format(run["script"])) + + # Accumulate dependency hashes for index generation. + data = [hash_paths(config.graph_config.vcs_root, files)] + + data.append(taskdesc["attributes"]["toolchain-artifact"]) + + # If the task uses an in-tree docker image, we want it to influence + # the index path as well. Ideally, the content of the docker image itself + # should have an influence, but at the moment, we can't get that + # information here. So use the docker image name as a proxy. Not a lot of + # changes to docker images actually have an impact on the resulting + # toolchain artifact, so we'll just rely on such important changes to be + # accompanied with a docker image name change. + image = taskdesc["worker"].get("docker-image", {}).get("in-tree") + if image: + data.append(image) + + # Likewise script arguments should influence the index. + args = run.get("arguments") + if args: + data.extend(args) + return data + + +def common_toolchain(config, job, taskdesc, is_docker): + run = job["run"] + + worker = taskdesc["worker"] = job["worker"] + worker["chain-of-trust"] = True + + srcdir = get_vcsdir_name(worker["os"]) + + if is_docker: + # If the task doesn't have a docker-image, set a default + worker.setdefault("docker-image", {"in-tree": "toolchain-build"}) + + # Allow the job to specify where artifacts come from, but add + # public/build if it's not there already. + artifacts = worker.setdefault("artifacts", []) + if not any(artifact.get("name") == "public/build" for artifact in artifacts): + if is_docker: + docker_worker_add_artifacts(config, job, taskdesc) + else: + generic_worker_add_artifacts(config, job, taskdesc) + + env = worker["env"] + env.update( + { + "MOZ_BUILD_DATE": config.params["moz_build_date"], + "MOZ_SCM_LEVEL": config.params["level"], + } + ) + + attributes = taskdesc.setdefault("attributes", {}) + attributes["toolchain-artifact"] = run.pop("toolchain-artifact") + if "toolchain-alias" in run: + attributes["toolchain-alias"] = run.pop("toolchain-alias") + if "toolchain-env" in run: + attributes["toolchain-env"] = run.pop("toolchain-env") + + if not taskgraph.fast: + name = taskdesc["label"].replace(f"{config.kind}-", "", 1) + taskdesc["cache"] = { + "type": CACHE_TYPE, + "name": name, + "digest-data": get_digest_data(config, run, taskdesc), + } + + script = run.pop("script") + run["using"] = "run-task" + run["cwd"] = "{checkout}/.." 
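+ # From this point the toolchain job is re-expressed as a plain `run-task`
+ # job: the script becomes the command built below, and the rewritten `run`
+ # section is handed to `configure_taskdesc_for_run` at the end of this
+ # function.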
+ + if script.endswith(".ps1"): + run["exec-with"] = "powershell" + + command = [f"{srcdir}/taskcluster/scripts/toolchain/{script}"] + run.pop( + "arguments", [] + ) + + if not is_docker: + # Don't quote the first item in the command because it purposely contains + # an environment variable that is not meant to be quoted. + if len(command) > 1: + command = command[0] + " " + shell_quote(*command[1:]) + else: + command = command[0] + + run["command"] = command + + configure_taskdesc_for_run(config, job, taskdesc, worker["implementation"]) + + +toolchain_defaults = { + "sparse-profile": "toolchain-build", +} + + +@run_job_using( + "docker-worker", + "toolchain-script", + schema=toolchain_run_schema, + defaults=toolchain_defaults, +) +def docker_worker_toolchain(config, job, taskdesc): + common_toolchain(config, job, taskdesc, is_docker=True) + + +@run_job_using( + "generic-worker", + "toolchain-script", + schema=toolchain_run_schema, + defaults=toolchain_defaults, +) +def generic_worker_toolchain(config, job, taskdesc): + common_toolchain(config, job, taskdesc, is_docker=False) diff --git a/third_party/python/taskcluster_taskgraph/taskgraph/transforms/notify.py b/third_party/python/taskcluster_taskgraph/taskgraph/transforms/notify.py new file mode 100644 index 0000000000..a61e7999c1 --- /dev/null +++ b/third_party/python/taskcluster_taskgraph/taskgraph/transforms/notify.py @@ -0,0 +1,195 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. +""" +Add notifications to tasks via Taskcluster's notify service. + +See https://docs.taskcluster.net/docs/reference/core/notify/usage for +more information. +""" +from voluptuous import ALLOW_EXTRA, Any, Exclusive, Optional, Required + +from taskgraph.transforms.base import TransformSequence +from taskgraph.util.schema import Schema, optionally_keyed_by, resolve_keyed_by + +_status_type = Any( + "on-completed", + "on-defined", + "on-exception", + "on-failed", + "on-pending", + "on-resolved", + "on-running", +) + +_recipients = [ + { + Required("type"): "email", + Required("address"): optionally_keyed_by("project", "level", str), + Optional("status-type"): _status_type, + }, + { + Required("type"): "matrix-room", + Required("room-id"): str, + Optional("status-type"): _status_type, + }, + { + Required("type"): "pulse", + Required("routing-key"): str, + Optional("status-type"): _status_type, + }, + { + Required("type"): "slack-channel", + Required("channel-id"): str, + Optional("status-type"): _status_type, + }, +] + +_route_keys = { + "email": "address", + "matrix-room": "room-id", + "pulse": "routing-key", + "slack-channel": "channel-id", +} +"""Map each type to its primary key that will be used in the route.""" + +NOTIFY_SCHEMA = Schema( + { + Exclusive("notify", "config"): { + Required("recipients"): [Any(*_recipients)], + Optional("content"): { + Optional("email"): { + Optional("subject"): str, + Optional("content"): str, + Optional("link"): { + Required("text"): str, + Required("href"): str, + }, + }, + Optional("matrix"): { + Optional("body"): str, + Optional("formatted-body"): str, + Optional("format"): str, + Optional("msg-type"): str, + }, + Optional("slack"): { + Optional("text"): str, + Optional("blocks"): list, + Optional("attachments"): list, + }, + }, + }, + # Continue supporting the legacy schema for backwards compat. 
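+ # A legacy stanza such as {"emails": [...], "subject": "..."} is rewritten
+ # into the `notify` form above by `_convert_legacy` below.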
+ Exclusive("notifications", "config"): { + Required("emails"): optionally_keyed_by("project", "level", [str]), + Required("subject"): str, + Optional("message"): str, + Optional("status-types"): [_status_type], + }, + }, + extra=ALLOW_EXTRA, +) +"""Notify schema.""" + +transforms = TransformSequence() +transforms.add_validate(NOTIFY_SCHEMA) + + +def _convert_legacy(config, legacy, label): + """Convert the legacy format to the new one.""" + notify = { + "recipients": [], + "content": {"email": {"subject": legacy["subject"]}}, + } + resolve_keyed_by( + legacy, + "emails", + label, + **{ + "level": config.params["level"], + "project": config.params["project"], + }, + ) + + status_types = legacy.get("status-types", ["on-completed"]) + for email in legacy["emails"]: + for status_type in status_types: + notify["recipients"].append( + {"type": "email", "address": email, "status-type": status_type} + ) + + notify["content"]["email"]["content"] = legacy.get("message", legacy["subject"]) + return notify + + +def _convert_content(content): + """Convert the notify content to Taskcluster's format. + + The Taskcluster notification format is described here: + https://docs.taskcluster.net/docs/reference/core/notify/usage + """ + tc = {} + if "email" in content: + tc["email"] = content.pop("email") + + for key, obj in content.items(): + for name in obj.keys(): + tc_name = "".join(part.capitalize() for part in name.split("-")) + tc[f"{key}{tc_name}"] = obj[name] + return tc + + +@transforms.add +def add_notifications(config, tasks): + for task in tasks: + label = "{}-{}".format(config.kind, task["name"]) + if "notifications" in task: + notify = _convert_legacy(config, task.pop("notifications"), label) + else: + notify = task.pop("notify", None) + + if not notify: + yield task + continue + + format_kwargs = dict( + task=task, + config=config.__dict__, + ) + + def substitute(ctx): + """Recursively find all strings in a simple nested dict (no lists), + and format them in-place using `format_kwargs`.""" + for key, val in ctx.items(): + if isinstance(val, str): + ctx[key] = val.format(**format_kwargs) + elif isinstance(val, dict): + ctx[key] = substitute(val) + return ctx + + task.setdefault("routes", []) + for recipient in notify["recipients"]: + type = recipient["type"] + recipient.setdefault("status-type", "on-completed") + substitute(recipient) + + if type == "email": + resolve_keyed_by( + recipient, + "address", + label, + **{ + "level": config.params["level"], + "project": config.params["project"], + }, + ) + + task["routes"].append( + f"notify.{type}.{recipient[_route_keys[type]]}.{recipient['status-type']}" + ) + + if "content" in notify: + task.setdefault("extra", {}).update( + {"notify": _convert_content(substitute(notify["content"]))} + ) + yield task diff --git a/third_party/python/taskcluster_taskgraph/taskgraph/transforms/task.py b/third_party/python/taskcluster_taskgraph/taskgraph/transforms/task.py new file mode 100644 index 0000000000..c55de78513 --- /dev/null +++ b/third_party/python/taskcluster_taskgraph/taskgraph/transforms/task.py @@ -0,0 +1,1375 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. +""" +These transformations take a task description and turn it into a TaskCluster +task definition (along with attributes, label, etc.). 
The input to these +transformations is generic to any kind of task, but abstracts away some of the +complexities of worker implementations, scopes, and treeherder annotations. +""" + + +import hashlib +import os +import re +import time +from copy import deepcopy +from dataclasses import dataclass +from typing import Callable + +from voluptuous import All, Any, Extra, NotIn, Optional, Required + +from taskgraph import MAX_DEPENDENCIES +from taskgraph.transforms.base import TransformSequence +from taskgraph.util.hash import hash_path +from taskgraph.util.keyed_by import evaluate_keyed_by +from taskgraph.util.memoize import memoize +from taskgraph.util.schema import ( + OptimizationSchema, + Schema, + optionally_keyed_by, + resolve_keyed_by, + taskref_or_string, + validate_schema, +) +from taskgraph.util.treeherder import split_symbol, treeherder_defaults +from taskgraph.util.workertypes import worker_type_implementation + +from ..util import docker as dockerutil +from ..util.workertypes import get_worker_type + +RUN_TASK = os.path.join( + os.path.dirname(os.path.dirname(__file__)), "run-task", "run-task" +) + + +@memoize +def _run_task_suffix(): + """String to append to cache names under control of run-task.""" + return hash_path(RUN_TASK)[0:20] + + +# A task description is a general description of a TaskCluster task +task_description_schema = Schema( + { + # the label for this task + Required("label"): str, + # description of the task (for metadata) + Required("description"): str, + # attributes for this task + Optional("attributes"): {str: object}, + # relative path (from config.path) to the file task was defined in + Optional("task-from"): str, + # dependencies of this task, keyed by name; these are passed through + # verbatim and subject to the interpretation of the Task's get_dependencies + # method. + Optional("dependencies"): { + All( + str, + NotIn( + ["self", "decision"], + "Can't use 'self` or 'decision' as dependency names.", + ), + ): object, + }, + # Soft dependencies of this task, as a list of tasks labels + Optional("soft-dependencies"): [str], + # Dependencies that must be scheduled in order for this task to run. + Optional("if-dependencies"): [str], + Optional("requires"): Any("all-completed", "all-resolved"), + # expiration and deadline times, relative to task creation, with units + # (e.g., "14 days"). Defaults are set based on the project. + Optional("expires-after"): str, + Optional("deadline-after"): str, + # custom routes for this task; the default treeherder routes will be added + # automatically + Optional("routes"): [str], + # custom scopes for this task; any scopes required for the worker will be + # added automatically. The following parameters will be substituted in each + # scope: + # {level} -- the scm level of this push + # {project} -- the project of this push + Optional("scopes"): [str], + # Tags + Optional("tags"): {str: str}, + # custom "task.extra" content + Optional("extra"): {str: object}, + # treeherder-related information; see + # https://schemas.taskcluster.net/taskcluster-treeherder/v1/task-treeherder-config.json + # This may be provided in one of two ways: + # 1) A simple `true` will cause taskgraph to generate the required information + # 2) A dictionary with one or more of the required keys. Any key not present + # will use a default as described below. + # If not specified, no treeherder extra information or routes will be + # added to the task + Optional("treeherder"): Any( + True, + { + # either a bare symbol, or "grp(sym)". 
+ # The default symbol is the uppercased first letter of each + # section of the kind (delimited by "-") all smooshed together. + # Eg: "test" becomes "T", "docker-image" becomes "DI", etc. + "symbol": Optional(str), + # the job kind + # If "build" or "test" is found in the kind name, this defaults + # to the appropriate value. Otherwise, defaults to "other" + "kind": Optional(Any("build", "test", "other")), + # tier for this task + # Defaults to 1 + "tier": Optional(int), + # task platform, in the form platform/collection, used to set + # treeherder.machine.platform and treeherder.collection or + # treeherder.labels + # Defaults to "default/opt" + "platform": Optional(str), + }, + ), + # information for indexing this build so its artifacts can be discovered; + # if omitted, the build will not be indexed. + Optional("index"): { + # the name of the product this build produces + "product": str, + # the names to use for this job in the TaskCluster index + "job-name": str, + # Type of gecko v2 index to use + "type": str, + # The rank that the task will receive in the TaskCluster + # index. A newly completed task supersedes the currently + # indexed task iff it has a higher rank. If unspecified, + # 'by-tier' behavior will be used. + "rank": Any( + # Rank is equal the timestamp of the build_date for tier-1 + # tasks, and zero for non-tier-1. This sorts tier-{2,3} + # builds below tier-1 in the index. + "by-tier", + # Rank is given as an integer constant (e.g. zero to make + # sure a task is last in the index). + int, + # Rank is equal to the timestamp of the build_date. This + # option can be used to override the 'by-tier' behavior + # for non-tier-1 tasks. + "build_date", + ), + }, + # The `run_on_projects` attribute, defaulting to "all". This dictates the + # projects on which this task should be included in the target task set. + # See the attributes documentation for details. + Optional("run-on-projects"): optionally_keyed_by("build-platform", [str]), + Optional("run-on-tasks-for"): [str], + Optional("run-on-git-branches"): [str], + # The `shipping_phase` attribute, defaulting to None. This specifies the + # release promotion phase that this task belongs to. + Optional("shipping-phase"): Any( + None, + "build", + "promote", + "push", + "ship", + ), + # The `always-target` attribute will cause the task to be included in the + # target_task_graph regardless of filtering. Tasks included in this manner + # will be candidates for optimization even when `optimize_target_tasks` is + # False, unless the task was also explicitly chosen by the target_tasks + # method. + Required("always-target"): bool, + # Optimization to perform on this task during the optimization phase. + # Optimizations are defined in taskcluster/taskgraph/optimize.py. + Required("optimization"): OptimizationSchema, + # the provisioner-id/worker-type for the task. The following parameters will + # be substituted in this string: + # {level} -- the scm level of this push + "worker-type": str, + # Whether the job should use sccache compiler caching. 
+ Required("needs-sccache"): bool, + # information specific to the worker implementation that will run this task + Optional("worker"): { + Required("implementation"): str, + Extra: object, + }, + } +) + +TC_TREEHERDER_SCHEMA_URL = ( + "https://github.com/taskcluster/taskcluster-treeherder/" + "blob/master/schemas/task-treeherder-config.yml" +) + + +UNKNOWN_GROUP_NAME = ( + "Treeherder group {} (from {}) has no name; " "add it to taskcluster/ci/config.yml" +) + +V2_ROUTE_TEMPLATES = [ + "index.{trust-domain}.v2.{project}.latest.{product}.{job-name}", + "index.{trust-domain}.v2.{project}.pushdate.{build_date_long}.{product}.{job-name}", + "index.{trust-domain}.v2.{project}.pushlog-id.{pushlog_id}.{product}.{job-name}", + "index.{trust-domain}.v2.{project}.revision.{branch_rev}.{product}.{job-name}", +] + +# the roots of the treeherder routes +TREEHERDER_ROUTE_ROOT = "tc-treeherder" + + +def get_branch_rev(config): + return config.params["head_rev"] + + +@memoize +def get_default_priority(graph_config, project): + return evaluate_keyed_by( + graph_config["task-priority"], "Graph Config", {"project": project} + ) + + +@memoize +def get_default_deadline(graph_config, project): + return evaluate_keyed_by( + graph_config["task-deadline-after"], "Graph Config", {"project": project} + ) + + +# define a collection of payload builders, depending on the worker implementation +payload_builders = {} + + +@dataclass(frozen=True) +class PayloadBuilder: + schema: Schema + builder: Callable + + +def payload_builder(name, schema): + schema = Schema({Required("implementation"): name, Optional("os"): str}).extend( + schema + ) + + def wrap(func): + assert name not in payload_builders, f"duplicate payload builder name {name}" + payload_builders[name] = PayloadBuilder(schema, func) + return func + + return wrap + + +# define a collection of index builders, depending on the type implementation +index_builders = {} + + +def index_builder(name): + def wrap(func): + assert name not in index_builders, f"duplicate index builder name {name}" + index_builders[name] = func + return func + + return wrap + + +UNSUPPORTED_INDEX_PRODUCT_ERROR = """\ +The index product {product} is not in the list of configured products in +`taskcluster/ci/config.yml'. +""" + + +def verify_index(config, index): + product = index["product"] + if product not in config.graph_config["index"]["products"]: + raise Exception(UNSUPPORTED_INDEX_PRODUCT_ERROR.format(product=product)) + + +@payload_builder( + "docker-worker", + schema={ + Required("os"): "linux", + # For tasks that will run in docker-worker, this is the name of the docker + # image or in-tree docker image to run the task in. If in-tree, then a + # dependency will be created automatically. This is generally + # `desktop-test`, or an image that acts an awful lot like it. + Required("docker-image"): Any( + # a raw Docker image path (repo/image:tag) + str, + # an in-tree generated docker image (from `taskcluster/docker/<name>`) + {"in-tree": str}, + # an indexed docker image + {"indexed": str}, + ), + # worker features that should be enabled + Required("relengapi-proxy"): bool, + Required("chain-of-trust"): bool, + Required("taskcluster-proxy"): bool, + Required("allow-ptrace"): bool, + Required("loopback-video"): bool, + Required("loopback-audio"): bool, + Required("docker-in-docker"): bool, # (aka 'dind') + Required("privileged"): bool, + # Paths to Docker volumes. + # + # For in-tree Docker images, volumes can be parsed from Dockerfile. 
+ # This only works for the Dockerfile itself: if a volume is defined in + # a base image, it will need to be declared here. Out-of-tree Docker + # images will also require explicit volume annotation. + # + # Caches are often mounted to the same path as Docker volumes. In this + # case, they take precedence over a Docker volume. But a volume still + # needs to be declared for the path. + Optional("volumes"): [str], + # caches to set up for the task + Optional("caches"): [ + { + # only one type is supported by any of the workers right now + "type": "persistent", + # name of the cache, allowing re-use by subsequent tasks naming the + # same cache + "name": str, + # location in the task image where the cache will be mounted + "mount-point": str, + # Whether the cache is not used in untrusted environments + # (like the Try repo). + Optional("skip-untrusted"): bool, + } + ], + # artifacts to extract from the task image after completion + Optional("artifacts"): [ + { + # type of artifact -- simple file, or recursive directory + "type": Any("file", "directory"), + # task image path from which to read artifact + "path": str, + # name of the produced artifact (root of the names for + # type=directory) + "name": str, + } + ], + # environment variables + Required("env"): {str: taskref_or_string}, + # the command to run; if not given, docker-worker will default to the + # command in the docker image + Optional("command"): [taskref_or_string], + # the maximum time to run, in seconds + Required("max-run-time"): int, + # the exit status code(s) that indicates the task should be retried + Optional("retry-exit-status"): [int], + # the exit status code(s) that indicates the caches used by the task + # should be purged + Optional("purge-caches-exit-status"): [int], + # Whether any artifacts are assigned to this worker + Optional("skip-artifacts"): bool, + }, +) +def build_docker_worker_payload(config, task, task_def): + worker = task["worker"] + level = int(config.params["level"]) + + image = worker["docker-image"] + if isinstance(image, dict): + if "in-tree" in image: + name = image["in-tree"] + docker_image_task = "build-docker-image-" + image["in-tree"] + task.setdefault("dependencies", {})["docker-image"] = docker_image_task + + image = { + "path": "public/image.tar.zst", + "taskId": {"task-reference": "<docker-image>"}, + "type": "task-image", + } + + # Find VOLUME in Dockerfile. 
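+ # Volumes declared in the in-tree image's Dockerfile are appended
+ # automatically; re-declaring one of them under `worker.volumes` is
+ # rejected below as redundant.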
+ volumes = dockerutil.parse_volumes(name) + for v in sorted(volumes): + if v in worker["volumes"]: + raise Exception( + "volume %s already defined; " + "if it is defined in a Dockerfile, " + "it does not need to be specified in the " + "worker definition" % v + ) + + worker["volumes"].append(v) + + elif "indexed" in image: + image = { + "path": "public/image.tar.zst", + "namespace": image["indexed"], + "type": "indexed-image", + } + else: + raise Exception("unknown docker image type") + + features = {} + + if worker.get("relengapi-proxy"): + features["relengAPIProxy"] = True + + if worker.get("taskcluster-proxy"): + features["taskclusterProxy"] = True + + if worker.get("allow-ptrace"): + features["allowPtrace"] = True + task_def["scopes"].append("docker-worker:feature:allowPtrace") + + if worker.get("chain-of-trust"): + features["chainOfTrust"] = True + + if worker.get("docker-in-docker"): + features["dind"] = True + + if task.get("needs-sccache"): + features["taskclusterProxy"] = True + task_def["scopes"].append( + "assume:project:taskcluster:{trust_domain}:level-{level}-sccache-buckets".format( + trust_domain=config.graph_config["trust-domain"], + level=config.params["level"], + ) + ) + worker["env"]["USE_SCCACHE"] = "1" + # Disable sccache idle shutdown. + worker["env"]["SCCACHE_IDLE_TIMEOUT"] = "0" + else: + worker["env"]["SCCACHE_DISABLE"] = "1" + + capabilities = {} + + for lo in "audio", "video": + if worker.get("loopback-" + lo): + capitalized = "loopback" + lo.capitalize() + devices = capabilities.setdefault("devices", {}) + devices[capitalized] = True + task_def["scopes"].append("docker-worker:capability:device:" + capitalized) + + if worker.get("privileged"): + capabilities["privileged"] = True + task_def["scopes"].append("docker-worker:capability:privileged") + + task_def["payload"] = payload = { + "image": image, + "env": worker["env"], + } + if "command" in worker: + payload["command"] = worker["command"] + + if "max-run-time" in worker: + payload["maxRunTime"] = worker["max-run-time"] + + run_task = payload.get("command", [""])[0].endswith("run-task") + + # run-task exits EXIT_PURGE_CACHES if there is a problem with caches. + # Automatically retry the tasks and purge caches if we see this exit + # code. + # TODO move this closer to code adding run-task once bug 1469697 is + # addressed. + if run_task: + worker.setdefault("retry-exit-status", []).append(72) + worker.setdefault("purge-caches-exit-status", []).append(72) + + payload["onExitStatus"] = {} + if "retry-exit-status" in worker: + payload["onExitStatus"]["retry"] = worker["retry-exit-status"] + if "purge-caches-exit-status" in worker: + payload["onExitStatus"]["purgeCaches"] = worker["purge-caches-exit-status"] + + if "artifacts" in worker: + artifacts = {} + for artifact in worker["artifacts"]: + artifacts[artifact["name"]] = { + "path": artifact["path"], + "type": artifact["type"], + "expires": task_def["expires"], # always expire with the task + } + payload["artifacts"] = artifacts + + if isinstance(worker.get("docker-image"), str): + out_of_tree_image = worker["docker-image"] + else: + out_of_tree_image = None + image = worker.get("docker-image", {}).get("in-tree") + + if "caches" in worker: + caches = {} + + # run-task knows how to validate caches. + # + # To help ensure new run-task features and bug fixes don't interfere + # with existing caches, we seed the hash of run-task into cache names. + # So, any time run-task changes, we should get a fresh set of caches. 
+ # This means run-task can make changes to cache interaction at any time + # without regards for backwards or future compatibility. + # + # But this mechanism only works for in-tree Docker images that are built + # with the current run-task! For out-of-tree Docker images, we have no + # way of knowing their content of run-task. So, in addition to varying + # cache names by the contents of run-task, we also take the Docker image + # name into consideration. This means that different Docker images will + # never share the same cache. This is a bit unfortunate. But it is the + # safest thing to do. Fortunately, most images are defined in-tree. + # + # For out-of-tree Docker images, we don't strictly need to incorporate + # the run-task content into the cache name. However, doing so preserves + # the mechanism whereby changing run-task results in new caches + # everywhere. + + # As an additional mechanism to force the use of different caches, the + # string literal in the variable below can be changed. This is + # preferred to changing run-task because it doesn't require images + # to be rebuilt. + cache_version = "v3" + + if run_task: + suffix = f"{cache_version}-{_run_task_suffix()}" + + if out_of_tree_image: + name_hash = hashlib.sha256( + out_of_tree_image.encode("utf-8") + ).hexdigest() + suffix += name_hash[0:12] + + else: + suffix = cache_version + + skip_untrusted = config.params.is_try() or level == 1 + + for cache in worker["caches"]: + # Some caches aren't enabled in environments where we can't + # guarantee certain behavior. Filter those out. + if cache.get("skip-untrusted") and skip_untrusted: + continue + + name = "{trust_domain}-level-{level}-{name}-{suffix}".format( + trust_domain=config.graph_config["trust-domain"], + level=config.params["level"], + name=cache["name"], + suffix=suffix, + ) + caches[name] = cache["mount-point"] + task_def["scopes"].append("docker-worker:cache:%s" % name) + + # Assertion: only run-task is interested in this. + if run_task: + payload["env"]["TASKCLUSTER_CACHES"] = ";".join(sorted(caches.values())) + + payload["cache"] = caches + + # And send down volumes information to run-task as well. + if run_task and worker.get("volumes"): + payload["env"]["TASKCLUSTER_VOLUMES"] = ";".join(sorted(worker["volumes"])) + + if payload.get("cache") and skip_untrusted: + payload["env"]["TASKCLUSTER_UNTRUSTED_CACHES"] = "1" + + if features: + payload["features"] = features + if capabilities: + payload["capabilities"] = capabilities + + check_caches_are_volumes(task) + + +@payload_builder( + "generic-worker", + schema={ + Required("os"): Any("windows", "macosx", "linux", "linux-bitbar"), + # see http://schemas.taskcluster.net/generic-worker/v1/payload.json + # and https://docs.taskcluster.net/reference/workers/generic-worker/payload + # command is a list of commands to run, sequentially + # on Windows, each command is a string, on OS X and Linux, each command is + # a string array + Required("command"): Any( + [taskref_or_string], [[taskref_or_string]] # Windows # Linux / OS X + ), + # artifacts to extract from the task image after completion; note that artifacts + # for the generic worker cannot have names + Optional("artifacts"): [ + { + # type of artifact -- simple file, or recursive directory + "type": Any("file", "directory"), + # filesystem path from which to read artifact + "path": str, + # if not specified, path is used for artifact name + Optional("name"): str, + } + ], + # Directories and/or files to be mounted. 
+ # The actual allowed combinations are stricter than the model below, + # but this provides a simple starting point. + # See https://docs.taskcluster.net/reference/workers/generic-worker/payload + Optional("mounts"): [ + { + # A unique name for the cache volume, implies writable cache directory + # (otherwise mount is a read-only file or directory). + Optional("cache-name"): str, + # Optional content for pre-loading cache, or mandatory content for + # read-only file or directory. Pre-loaded content can come from either + # a task artifact or from a URL. + Optional("content"): { + # *** Either (artifact and task-id) or url must be specified. *** + # Artifact name that contains the content. + Optional("artifact"): str, + # Task ID that has the artifact that contains the content. + Optional("task-id"): taskref_or_string, + # URL that supplies the content in response to an unauthenticated + # GET request. + Optional("url"): str, + }, + # *** Either file or directory must be specified. *** + # If mounting a cache or read-only directory, the filesystem location of + # the directory should be specified as a relative path to the task + # directory here. + Optional("directory"): str, + # If mounting a file, specify the relative path within the task + # directory to mount the file (the file will be read only). + Optional("file"): str, + # Required if and only if `content` is specified and mounting a + # directory (not a file). This should be the archive format of the + # content (either pre-loaded cache or read-only directory). + Optional("format"): Any("rar", "tar.bz2", "tar.gz", "zip"), + } + ], + # environment variables + Required("env"): {str: taskref_or_string}, + # the maximum time to run, in seconds + Required("max-run-time"): int, + # the exit status code(s) that indicates the task should be retried + Optional("retry-exit-status"): [int], + # the exit status code(s) that indicates the caches used by the task + # should be purged + Optional("purge-caches-exit-status"): [int], + # os user groups for test task workers + Optional("os-groups"): [str], + # feature for test task to run as administarotr + Optional("run-as-administrator"): bool, + # optional features + Required("chain-of-trust"): bool, + Optional("taskcluster-proxy"): bool, + # Whether any artifacts are assigned to this worker + Optional("skip-artifacts"): bool, + }, +) +def build_generic_worker_payload(config, task, task_def): + worker = task["worker"] + + task_def["payload"] = { + "command": worker["command"], + "maxRunTime": worker["max-run-time"], + } + + on_exit_status = {} + if "retry-exit-status" in worker: + on_exit_status["retry"] = worker["retry-exit-status"] + if "purge-caches-exit-status" in worker: + on_exit_status["purgeCaches"] = worker["purge-caches-exit-status"] + if worker["os"] == "windows": + on_exit_status.setdefault("retry", []).extend( + [ + # These codes (on windows) indicate a process interruption, + # rather than a task run failure. See bug 1544403. + 1073807364, # process force-killed due to system shutdown + 3221225786, # sigint (any interrupt) + ] + ) + if on_exit_status: + task_def["payload"]["onExitStatus"] = on_exit_status + + env = worker.get("env", {}) + + if task.get("needs-sccache"): + env["USE_SCCACHE"] = "1" + # Disable sccache idle shutdown. 
+ env["SCCACHE_IDLE_TIMEOUT"] = "0" + else: + env["SCCACHE_DISABLE"] = "1" + + if env: + task_def["payload"]["env"] = env + + artifacts = [] + + for artifact in worker.get("artifacts", []): + a = { + "path": artifact["path"], + "type": artifact["type"], + } + if "name" in artifact: + a["name"] = artifact["name"] + artifacts.append(a) + + if artifacts: + task_def["payload"]["artifacts"] = artifacts + + # Need to copy over mounts, but rename keys to respect naming convention + # * 'cache-name' -> 'cacheName' + # * 'task-id' -> 'taskId' + # All other key names are already suitable, and don't need renaming. + mounts = deepcopy(worker.get("mounts", [])) + for mount in mounts: + if "cache-name" in mount: + mount["cacheName"] = "{trust_domain}-level-{level}-{name}".format( + trust_domain=config.graph_config["trust-domain"], + level=config.params["level"], + name=mount.pop("cache-name"), + ) + task_def["scopes"].append( + "generic-worker:cache:{}".format(mount["cacheName"]) + ) + if "content" in mount: + if "task-id" in mount["content"]: + mount["content"]["taskId"] = mount["content"].pop("task-id") + if "artifact" in mount["content"]: + if not mount["content"]["artifact"].startswith("public/"): + task_def["scopes"].append( + "queue:get-artifact:{}".format(mount["content"]["artifact"]) + ) + + if mounts: + task_def["payload"]["mounts"] = mounts + + if worker.get("os-groups"): + task_def["payload"]["osGroups"] = worker["os-groups"] + task_def["scopes"].extend( + [ + "generic-worker:os-group:{}/{}".format(task["worker-type"], group) + for group in worker["os-groups"] + ] + ) + + features = {} + + if worker.get("chain-of-trust"): + features["chainOfTrust"] = True + + if worker.get("taskcluster-proxy"): + features["taskclusterProxy"] = True + + if worker.get("run-as-administrator", False): + features["runAsAdministrator"] = True + task_def["scopes"].append( + "generic-worker:run-as-administrator:{}".format(task["worker-type"]), + ) + + if features: + task_def["payload"]["features"] = features + + +@payload_builder( + "beetmover", + schema={ + # the maximum time to run, in seconds + Required("max-run-time"): int, + # locale key, if this is a locale beetmover job + Optional("locale"): str, + Optional("partner-public"): bool, + Required("release-properties"): { + "app-name": str, + "app-version": str, + "branch": str, + "build-id": str, + "hash-type": str, + "platform": str, + }, + # list of artifact URLs for the artifacts that should be beetmoved + Required("upstream-artifacts"): [ + { + # taskId of the task with the artifact + Required("taskId"): taskref_or_string, + # type of signing task (for CoT) + Required("taskType"): str, + # Paths to the artifacts to sign + Required("paths"): [str], + # locale is used to map upload path and allow for duplicate simple names + Required("locale"): str, + } + ], + Optional("artifact-map"): object, + }, +) +def build_beetmover_payload(config, task, task_def): + worker = task["worker"] + release_properties = worker["release-properties"] + + task_def["payload"] = { + "maxRunTime": worker["max-run-time"], + "releaseProperties": { + "appName": release_properties["app-name"], + "appVersion": release_properties["app-version"], + "branch": release_properties["branch"], + "buildid": release_properties["build-id"], + "hashType": release_properties["hash-type"], + "platform": release_properties["platform"], + }, + "upload_date": config.params["build_date"], + "upstreamArtifacts": worker["upstream-artifacts"], + } + if worker.get("locale"): + task_def["payload"]["locale"] = 
worker["locale"] + if worker.get("artifact-map"): + task_def["payload"]["artifactMap"] = worker["artifact-map"] + if worker.get("partner-public"): + task_def["payload"]["is_partner_repack_public"] = worker["partner-public"] + + +@payload_builder( + "invalid", + schema={ + # an invalid task is one which should never actually be created; this is used in + # release automation on branches where the task just doesn't make sense + Extra: object, + }, +) +def build_invalid_payload(config, task, task_def): + task_def["payload"] = "invalid task - should never be created" + + +@payload_builder( + "always-optimized", + schema={ + Extra: object, + }, +) +@payload_builder("succeed", schema={}) +def build_dummy_payload(config, task, task_def): + task_def["payload"] = {} + + +transforms = TransformSequence() + + +@transforms.add +def set_implementation(config, tasks): + """ + Set the worker implementation based on the worker-type alias. + """ + for task in tasks: + worker = task.setdefault("worker", {}) + if "implementation" in task["worker"]: + yield task + continue + + impl, os = worker_type_implementation(config.graph_config, task["worker-type"]) + + tags = task.setdefault("tags", {}) + tags["worker-implementation"] = impl + if os: + task["tags"]["os"] = os + worker["implementation"] = impl + if os: + worker["os"] = os + + yield task + + +@transforms.add +def set_defaults(config, tasks): + for task in tasks: + task.setdefault("always-target", False) + task.setdefault("optimization", None) + task.setdefault("needs-sccache", False) + + worker = task["worker"] + if worker["implementation"] in ("docker-worker",): + worker.setdefault("relengapi-proxy", False) + worker.setdefault("chain-of-trust", False) + worker.setdefault("taskcluster-proxy", False) + worker.setdefault("allow-ptrace", False) + worker.setdefault("loopback-video", False) + worker.setdefault("loopback-audio", False) + worker.setdefault("docker-in-docker", False) + worker.setdefault("privileged", False) + worker.setdefault("volumes", []) + worker.setdefault("env", {}) + if "caches" in worker: + for c in worker["caches"]: + c.setdefault("skip-untrusted", False) + elif worker["implementation"] == "generic-worker": + worker.setdefault("env", {}) + worker.setdefault("os-groups", []) + if worker["os-groups"] and worker["os"] != "windows": + raise Exception( + "os-groups feature of generic-worker is only supported on " + "Windows, not on {}".format(worker["os"]) + ) + worker.setdefault("chain-of-trust", False) + elif worker["implementation"] in ( + "scriptworker-signing", + "beetmover", + "beetmover-push-to-release", + "beetmover-maven", + ): + worker.setdefault("max-run-time", 600) + elif worker["implementation"] == "push-apk": + worker.setdefault("commit", False) + + yield task + + +@transforms.add +def task_name_from_label(config, tasks): + for task in tasks: + if "label" not in task: + if "name" not in task: + raise Exception("task has neither a name nor a label") + task["label"] = "{}-{}".format(config.kind, task["name"]) + if task.get("name"): + del task["name"] + yield task + + +@transforms.add +def validate(config, tasks): + for task in tasks: + validate_schema( + task_description_schema, + task, + "In task {!r}:".format(task.get("label", "?no-label?")), + ) + validate_schema( + payload_builders[task["worker"]["implementation"]].schema, + task["worker"], + "In task.run {!r}:".format(task.get("label", "?no-label?")), + ) + yield task + + +@index_builder("generic") +def add_generic_index_routes(config, task): + index = task.get("index") + 
routes = task.setdefault("routes", []) + + verify_index(config, index) + + subs = config.params.copy() + subs["job-name"] = index["job-name"] + subs["build_date_long"] = time.strftime( + "%Y.%m.%d.%Y%m%d%H%M%S", time.gmtime(config.params["build_date"]) + ) + subs["product"] = index["product"] + subs["trust-domain"] = config.graph_config["trust-domain"] + subs["branch_rev"] = get_branch_rev(config) + + for tpl in V2_ROUTE_TEMPLATES: + routes.append(tpl.format(**subs)) + + return task + + +@transforms.add +def process_treeherder_metadata(config, tasks): + for task in tasks: + routes = task.get("routes", []) + extra = task.get("extra", {}) + task_th = task.get("treeherder") + + if task_th: + # This `merged_th` object is just an intermediary that combines + # the defaults and whatever is in the task. Ultimately, the task + # transforms this data a bit in the `treeherder` object that is + # eventually set in the task. + merged_th = treeherder_defaults(config.kind, task["label"]) + if isinstance(task_th, dict): + merged_th.update(task_th) + + treeherder = extra.setdefault("treeherder", {}) + extra.setdefault("treeherder-platform", merged_th["platform"]) + + machine_platform, collection = merged_th["platform"].split("/", 1) + treeherder["machine"] = {"platform": machine_platform} + treeherder["collection"] = {collection: True} + + group_names = config.graph_config["treeherder"]["group-names"] + groupSymbol, symbol = split_symbol(merged_th["symbol"]) + if groupSymbol != "?": + treeherder["groupSymbol"] = groupSymbol + if groupSymbol not in group_names: + path = os.path.join(config.path, task.get("task-from", "")) + raise Exception(UNKNOWN_GROUP_NAME.format(groupSymbol, path)) + treeherder["groupName"] = group_names[groupSymbol] + treeherder["symbol"] = symbol + if len(symbol) > 25 or len(groupSymbol) > 25: + raise RuntimeError( + "Treeherder group and symbol names must not be longer than " + "25 characters: {} (see {})".format( + treeherder["symbol"], + TC_TREEHERDER_SCHEMA_URL, + ) + ) + treeherder["jobKind"] = merged_th["kind"] + treeherder["tier"] = merged_th["tier"] + + branch_rev = get_branch_rev(config) + + if config.params["tasks_for"].startswith("github-pull-request"): + # In the past we used `project` for this, but that ends up being + # set to the repository name of the _head_ repo, which is not correct + # (and causes scope issues) if it doesn't match the name of the + # base repo + base_project = config.params["base_repository"].split("/")[-1] + if base_project.endswith(".git"): + base_project = base_project[:-4] + th_project_suffix = "-pr" + else: + base_project = config.params["project"] + th_project_suffix = "" + + routes.append( + "{}.v2.{}.{}.{}".format( + TREEHERDER_ROUTE_ROOT, + base_project + th_project_suffix, + branch_rev, + config.params["pushlog_id"], + ) + ) + + task["routes"] = routes + task["extra"] = extra + yield task + + +@transforms.add +def add_index_routes(config, tasks): + for task in tasks: + index = task.get("index", {}) + + # The default behavior is to rank tasks according to their tier + extra_index = task.setdefault("extra", {}).setdefault("index", {}) + rank = index.get("rank", "by-tier") + + if rank == "by-tier": + # rank is zero for non-tier-1 tasks and based on pushid for others; + # this sorts tier-{2,3} builds below tier-1 in the index + tier = task.get("extra", {}).get("treeherder", {}).get("tier", 3) + extra_index["rank"] = 0 if tier > 1 else int(config.params["build_date"]) + elif rank == "build_date": + extra_index["rank"] = 
int(config.params["build_date"]) + else: + extra_index["rank"] = rank + + if not index: + yield task + continue + + index_type = index.get("type", "generic") + if index_type not in index_builders: + raise ValueError(f"Unknown index-type {index_type}") + task = index_builders[index_type](config, task) + + del task["index"] + yield task + + +@transforms.add +def build_task(config, tasks): + for task in tasks: + level = str(config.params["level"]) + + provisioner_id, worker_type = get_worker_type( + config.graph_config, + task["worker-type"], + level, + ) + task["worker-type"] = "/".join([provisioner_id, worker_type]) + project = config.params["project"] + + routes = task.get("routes", []) + scopes = [ + s.format(level=level, project=project) for s in task.get("scopes", []) + ] + + # set up extra + extra = task.get("extra", {}) + extra["parent"] = os.environ.get("TASK_ID", "") + + if "expires-after" not in task: + task["expires-after"] = "28 days" if config.params.is_try() else "1 year" + + if "deadline-after" not in task: + if "task-deadline-after" in config.graph_config: + task["deadline-after"] = get_default_deadline( + config.graph_config, config.params["project"] + ) + else: + task["deadline-after"] = "1 day" + + if "priority" not in task: + task["priority"] = get_default_priority( + config.graph_config, config.params["project"] + ) + + tags = task.get("tags", {}) + tags.update( + { + "createdForUser": config.params["owner"], + "kind": config.kind, + "label": task["label"], + } + ) + + task_def = { + "provisionerId": provisioner_id, + "workerType": worker_type, + "routes": routes, + "created": {"relative-datestamp": "0 seconds"}, + "deadline": {"relative-datestamp": task["deadline-after"]}, + "expires": {"relative-datestamp": task["expires-after"]}, + "scopes": scopes, + "metadata": { + "description": task["description"], + "name": task["label"], + "owner": config.params["owner"], + "source": config.params.file_url(config.path, pretty=True), + }, + "extra": extra, + "tags": tags, + "priority": task["priority"], + } + + if task.get("requires", None): + task_def["requires"] = task["requires"] + + if task.get("extra", {}).get("treeherder"): + branch_rev = get_branch_rev(config) + if config.params["tasks_for"].startswith("github-pull-request"): + # In the past we used `project` for this, but that ends up being + # set to the repository name of the _head_ repo, which is not correct + # (and causes scope issues) if it doesn't match the name of the + # base repo + base_project = config.params["base_repository"].split("/")[-1] + if base_project.endswith(".git"): + base_project = base_project[:-4] + th_project_suffix = "-pr" + else: + base_project = config.params["project"] + th_project_suffix = "" + + # link back to treeherder in description + th_push_link = ( + "https://treeherder.mozilla.org/#/jobs?repo={}&revision={}".format( + config.params["project"] + th_project_suffix, branch_rev + ) + ) + task_def["metadata"]["description"] += " ([Treeherder push]({}))".format( + th_push_link + ) + + # add the payload and adjust anything else as required (e.g., scopes) + payload_builders[task["worker"]["implementation"]].builder( + config, task, task_def + ) + + attributes = task.get("attributes", {}) + # Resolve run-on-projects + build_platform = attributes.get("build_platform") + resolve_keyed_by( + task, + "run-on-projects", + item_name=task["label"], + **{"build-platform": build_platform}, + ) + attributes["run_on_projects"] = task.get("run-on-projects", ["all"]) + attributes["run_on_tasks_for"] = 
task.get("run-on-tasks-for", ["all"]) + # We don't want to pollute non git repos with this attribute. Moreover, target_tasks + # already assumes the default value is ['all'] + if task.get("run-on-git-branches"): + attributes["run_on_git_branches"] = task["run-on-git-branches"] + + attributes["always_target"] = task["always-target"] + # This logic is here since downstream tasks don't always match their + # upstream dependency's shipping_phase. + # A text_type task['shipping-phase'] takes precedence, then + # an existing attributes['shipping_phase'], then fall back to None. + if task.get("shipping-phase") is not None: + attributes["shipping_phase"] = task["shipping-phase"] + else: + attributes.setdefault("shipping_phase", None) + + # Set MOZ_AUTOMATION on all jobs. + if task["worker"]["implementation"] in ( + "generic-worker", + "docker-worker", + ): + payload = task_def.get("payload") + if payload: + env = payload.setdefault("env", {}) + env["MOZ_AUTOMATION"] = "1" + + dependencies = task.get("dependencies", {}) + if_dependencies = task.get("if-dependencies", []) + if if_dependencies: + for i, dep in enumerate(if_dependencies): + if dep in dependencies: + if_dependencies[i] = dependencies[dep] + continue + + raise Exception( + "{label} specifies '{dep}' in if-dependencies, " + "but {dep} is not a dependency!".format( + label=task["label"], dep=dep + ) + ) + + yield { + "label": task["label"], + "description": task["description"], + "task": task_def, + "dependencies": dependencies, + "if-dependencies": if_dependencies, + "soft-dependencies": task.get("soft-dependencies", []), + "attributes": attributes, + "optimization": task.get("optimization", None), + } + + +@transforms.add +def add_github_checks(config, tasks): + """ + For git repositories, add checks route to all tasks. + + This will be replaced by a configurable option in the future. + """ + if config.params["repository_type"] != "git": + for task in tasks: + yield task + + for task in tasks: + task["task"]["routes"].append("checks") + yield task + + +@transforms.add +def chain_of_trust(config, tasks): + for task in tasks: + if task["task"].get("payload", {}).get("features", {}).get("chainOfTrust"): + image = task.get("dependencies", {}).get("docker-image") + if image: + cot = ( + task["task"].setdefault("extra", {}).setdefault("chainOfTrust", {}) + ) + cot.setdefault("inputs", {})["docker-image"] = { + "task-reference": "<docker-image>" + } + yield task + + +@transforms.add +def check_task_identifiers(config, tasks): + """Ensures that all tasks have well defined identifiers: + ``^[a-zA-Z0-9_-]{1,38}$`` + """ + e = re.compile("^[a-zA-Z0-9_-]{1,38}$") + for task in tasks: + for attrib in ("workerType", "provisionerId"): + if not e.match(task["task"][attrib]): + raise Exception( + "task {}.{} is not a valid identifier: {}".format( + task["label"], attrib, task["task"][attrib] + ) + ) + yield task + + +@transforms.add +def check_task_dependencies(config, tasks): + """Ensures that tasks don't have more than 100 dependencies.""" + for task in tasks: + number_of_dependencies = ( + len(task["dependencies"]) + + len(task["if-dependencies"]) + + len(task["soft-dependencies"]) + ) + if number_of_dependencies > MAX_DEPENDENCIES: + raise Exception( + "task {}/{} has too many dependencies ({} > {})".format( + config.kind, + task["label"], + number_of_dependencies, + MAX_DEPENDENCIES, + ) + ) + yield task + + +def check_caches_are_volumes(task): + """Ensures that all cache paths are defined as volumes. 
+ + Caches and volumes are the only filesystem locations whose content + isn't defined by the Docker image itself. Some caches are optional + depending on the job environment. We want paths that are potentially + caches to have as similar behavior regardless of whether a cache is + used. To help enforce this, we require that all paths used as caches + to be declared as Docker volumes. This check won't catch all offenders. + But it is better than nothing. + """ + volumes = set(task["worker"]["volumes"]) + paths = {c["mount-point"] for c in task["worker"].get("caches", [])} + missing = paths - volumes + + if not missing: + return + + raise Exception( + "task {} (image {}) has caches that are not declared as " + "Docker volumes: {} " + "(have you added them as VOLUMEs in the Dockerfile?)".format( + task["label"], task["worker"]["docker-image"], ", ".join(sorted(missing)) + ) + ) + + +@transforms.add +def check_run_task_caches(config, tasks): + """Audit for caches requiring run-task. + + run-task manages caches in certain ways. If a cache managed by run-task + is used by a non run-task task, it could cause problems. So we audit for + that and make sure certain cache names are exclusive to run-task. + + IF YOU ARE TEMPTED TO MAKE EXCLUSIONS TO THIS POLICY, YOU ARE LIKELY + CONTRIBUTING TECHNICAL DEBT AND WILL HAVE TO SOLVE MANY OF THE PROBLEMS + THAT RUN-TASK ALREADY SOLVES. THINK LONG AND HARD BEFORE DOING THAT. + """ + re_reserved_caches = re.compile( + """^ + (checkouts|tooltool-cache) + """, + re.VERBOSE, + ) + + cache_prefix = "{trust_domain}-level-{level}-".format( + trust_domain=config.graph_config["trust-domain"], + level=config.params["level"], + ) + + suffix = _run_task_suffix() + + for task in tasks: + payload = task["task"].get("payload", {}) + command = payload.get("command") or [""] + + main_command = command[0] if isinstance(command[0], str) else "" + run_task = main_command.endswith("run-task") + + for cache in payload.get("cache", {}): + if not cache.startswith(cache_prefix): + raise Exception( + "{} is using a cache ({}) which is not appropriate " + "for its trust-domain and level. 
It should start with {}.".format( + task["label"], cache, cache_prefix + ) + ) + + cache = cache[len(cache_prefix) :] + + if not re_reserved_caches.match(cache): + continue + + if not run_task: + raise Exception( + f"{task['label']} is using a cache ({cache}) reserved for run-task " + "change the task to use run-task or use a different " + "cache name" + ) + + if not cache.endswith(suffix): + raise Exception( + f"{task['label']} is using a cache ({cache}) reserved for run-task " + "but the cache name is not dependent on the contents " + "of run-task; change the cache name to conform to the " + "naming requirements" + ) + + yield task diff --git a/third_party/python/taskcluster_taskgraph/taskgraph/transforms/task_context.py b/third_party/python/taskcluster_taskgraph/taskgraph/transforms/task_context.py new file mode 100644 index 0000000000..5c7ed6af80 --- /dev/null +++ b/third_party/python/taskcluster_taskgraph/taskgraph/transforms/task_context.py @@ -0,0 +1,121 @@ +from textwrap import dedent + +from voluptuous import ALLOW_EXTRA, Any, Optional, Required + +from taskgraph.transforms.base import TransformSequence +from taskgraph.util.schema import Schema +from taskgraph.util.templates import deep_get, substitute +from taskgraph.util.yaml import load_yaml + +SCHEMA = Schema( + { + Required( + "task-context", + description=dedent( + """ + `task-context` can be used to substitute values into any field in a + task with data that is not known until `taskgraph` runs. + + This data can be provided via `from-parameters` or `from-file`, + which can pull in values from parameters and a defined yml file + respectively. + + Data may also be provided directly in the `from-object` section of + `task-context`. This can be useful in `kinds` that define most of + their contents in `task-defaults`, but have some values that may + differ for various concrete `tasks` in the `kind`. + + If the same key is found in multiple places the order of precedence + is as follows: + - Parameters + - `from-object` keys + - File + + That is to say: parameters will always override anything else. + + """.lstrip(), + ), + ): { + Optional( + "from-parameters", + description=dedent( + """ + Retrieve task context values from parameters. A single + parameter may be provided or a list of parameters in + priority order. The latter can be useful in implementing a + "default" value if some other parameter is not provided. + """.lstrip() + ), + ): {str: Any([str], str)}, + Optional( + "from-file", + description=dedent( + """ + Retrieve task context values from a yaml file. The provided + file should usually only contain top level keys and values + (eg: nested objects will not be interpolated - they will be + substituted as text representations of the object). + """.lstrip() + ), + ): str, + Optional( + "from-object", + description="Key/value pairs to be used as task context", + ): object, + Required( + "substitution-fields", + description=dedent( + """ + A list of fields in the task to substitute the provided values + into. 
+ """.lstrip() + ), + ): [str], + }, + }, + extra=ALLOW_EXTRA, +) + +transforms = TransformSequence() +transforms.add_validate(SCHEMA) + + +@transforms.add +def render_task(config, jobs): + for job in jobs: + sub_config = job.pop("task-context") + params_context = {} + for var, path in sub_config.pop("from-parameters", {}).items(): + if isinstance(path, str): + params_context[var] = deep_get(config.params, path) + else: + for choice in path: + value = deep_get(config.params, choice) + if value is not None: + params_context[var] = value + break + + file_context = {} + from_file = sub_config.pop("from-file", None) + if from_file: + file_context = load_yaml(from_file) + + fields = sub_config.pop("substitution-fields") + + subs = {} + subs.update(file_context) + # We've popped away the configuration; everything left in `sub_config` is + # substitution key/value pairs. + subs.update(sub_config.pop("from-object", {})) + subs.update(params_context) + + # Now that we have our combined context, we can substitute. + for field in fields: + container, subfield = job, field + while "." in subfield: + f, subfield = subfield.split(".", 1) + container = container[f] + + container[subfield] = substitute(container[subfield], **subs) + + yield job diff --git a/third_party/python/taskcluster_taskgraph/taskgraph/util/__init__.py b/third_party/python/taskcluster_taskgraph/taskgraph/util/__init__.py new file mode 100644 index 0000000000..e69de29bb2 --- /dev/null +++ b/third_party/python/taskcluster_taskgraph/taskgraph/util/__init__.py diff --git a/third_party/python/taskcluster_taskgraph/taskgraph/util/archive.py b/third_party/python/taskcluster_taskgraph/taskgraph/util/archive.py new file mode 100644 index 0000000000..ee59ba4548 --- /dev/null +++ b/third_party/python/taskcluster_taskgraph/taskgraph/util/archive.py @@ -0,0 +1,86 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. + + +import gzip +import os +import stat +import tarfile + +# 2016-01-01T00:00:00+0000 +DEFAULT_MTIME = 1451606400 + + +def create_tar_from_files(fp, files): + """Create a tar file deterministically. + + Receives a dict mapping names of files in the archive to local filesystem + paths or ``mozpack.files.BaseFile`` instances. + + The files will be archived and written to the passed file handle opened + for writing. + + Only regular files can be written. + + FUTURE accept a filename argument (or create APIs to write files) + """ + with tarfile.open(name="", mode="w", fileobj=fp, dereference=True) as tf: + for archive_path, f in sorted(files.items()): + if isinstance(f, str): + mode = os.stat(f).st_mode + f = open(f, "rb") + else: + mode = 0o0644 + + ti = tarfile.TarInfo(archive_path) + ti.mode = mode + ti.type = tarfile.REGTYPE + + if not ti.isreg(): + raise ValueError("not a regular file: %s" % f) + + # Disallow setuid and setgid bits. This is an arbitrary restriction. + # However, since we set uid/gid to root:root, setuid and setgid + # would be a glaring security hole if the archive were + # uncompressed as root. + if ti.mode & (stat.S_ISUID | stat.S_ISGID): + raise ValueError("cannot add file with setuid or setgid set: " "%s" % f) + + # Set uid, gid, username, and group as deterministic values. + ti.uid = 0 + ti.gid = 0 + ti.uname = "" + ti.gname = "" + + # Set mtime to a constant value. 
+ ti.mtime = DEFAULT_MTIME + + f.seek(0, 2) + ti.size = f.tell() + f.seek(0, 0) + # tarfile wants to pass a size argument to read(). So just + # wrap/buffer in a proper file object interface. + tf.addfile(ti, f) + + +def create_tar_gz_from_files(fp, files, filename=None, compresslevel=9): + """Create a tar.gz file deterministically from files. + + This is a glorified wrapper around ``create_tar_from_files`` that + adds gzip compression. + + The passed file handle should be opened for writing in binary mode. + When the function returns, all data has been written to the handle. + """ + # Offset 3-7 in the gzip header contains an mtime. Pin it to a known + # value so output is deterministic. + gf = gzip.GzipFile( + filename=filename or "", + mode="wb", + fileobj=fp, + compresslevel=compresslevel, + mtime=DEFAULT_MTIME, + ) + with gf: + create_tar_from_files(gf, files) diff --git a/third_party/python/taskcluster_taskgraph/taskgraph/util/attributes.py b/third_party/python/taskcluster_taskgraph/taskgraph/util/attributes.py new file mode 100644 index 0000000000..74d6996629 --- /dev/null +++ b/third_party/python/taskcluster_taskgraph/taskgraph/util/attributes.py @@ -0,0 +1,96 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. + + +import re + + +def attrmatch(attributes, **kwargs): + """Determine whether the given set of task attributes matches. + + The conditions are given as keyword arguments, where each keyword names an + attribute. The keyword value can be a literal, a set, or a callable: + + * A literal must match the attribute exactly. + * Given a set or list, the attribute value must be contained within it. + * A callable is called with the attribute value and returns a boolean. + + If an attribute is specified as a keyword argument but not present in the + task's attributes, the result is False. + + Args: + attributes (dict): The task's attributes object. + kwargs (dict): The conditions the task's attributes must satisfy in + order to match. + Returns: + bool: Whether the task's attributes match the conditions or not. + """ + for kwkey, kwval in kwargs.items(): + if kwkey not in attributes: + return False + attval = attributes[kwkey] + if isinstance(kwval, (set, list)): + if attval not in kwval: + return False + elif callable(kwval): + if not kwval(attval): + return False + elif kwval != attributes[kwkey]: + return False + return True + + +def keymatch(attributes, target): + """Determine if any keys in attributes are a match to target, then return + a list of matching values. First exact matches will be checked. Failing + that, regex matches and finally a default key. + """ + # exact match + if target in attributes: + return [attributes[target]] + + # regular expression match + matches = [v for k, v in attributes.items() if re.match(k + "$", target)] + if matches: + return matches + + # default + if "default" in attributes: + return [attributes["default"]] + + return [] + + +def _match_run_on(key, run_on): + """ + Determine whether the given parameter is included in the corresponding `run-on-attribute`. + """ + if "all" in run_on: + return True + return key in run_on + + +match_run_on_projects = _match_run_on +match_run_on_tasks_for = _match_run_on + + +def match_run_on_git_branches(git_branch, run_on_git_branches): + """ + Determine whether the given project is included in the `run-on-git-branches` parameter. + Allows 'all'. 
+ """ + if "all" in run_on_git_branches: + return True + + for expected_git_branch_pattern in run_on_git_branches: + if re.match(expected_git_branch_pattern, git_branch): + return True + + return False + + +def sorted_unique_list(*args): + """Join one or more lists, and return a sorted list of unique members""" + combined = set().union(*args) + return sorted(combined) diff --git a/third_party/python/taskcluster_taskgraph/taskgraph/util/cached_tasks.py b/third_party/python/taskcluster_taskgraph/taskgraph/util/cached_tasks.py new file mode 100644 index 0000000000..974b114902 --- /dev/null +++ b/third_party/python/taskcluster_taskgraph/taskgraph/util/cached_tasks.py @@ -0,0 +1,86 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. + + +import hashlib +import time + +TARGET_CACHE_INDEX = "{cache_prefix}.cache.level-{level}.{type}.{name}.hash.{digest}" +EXTRA_CACHE_INDEXES = [ + "{cache_prefix}.cache.level-{level}.{type}.{name}.latest", + "{cache_prefix}.cache.level-{level}.{type}.{name}.pushdate.{build_date_long}", +] + + +def add_optimization( + config, taskdesc, cache_type, cache_name, digest=None, digest_data=None +): + """ + Allow the results of this task to be cached. This adds index routes to the + task so it can be looked up for future runs, and optimization hints so that + cached artifacts can be found. Exactly one of `digest` and `digest_data` + must be passed. + + :param TransformConfig config: The configuration for the kind being transformed. + :param dict taskdesc: The description of the current task. + :param str cache_type: The type of task result being cached. + :param str cache_name: The name of the object being cached. + :param digest: A unique string identifying this version of the artifacts + being generated. Typically this will be the hash of inputs to the task. + :type digest: bytes or None + :param digest_data: A list of bytes representing the inputs of this task. + They will be concatenated and hashed to create the digest for this + task. + :type digest_data: list of bytes or None + """ + if (digest is None) == (digest_data is None): + raise Exception("Must pass exactly one of `digest` and `digest_data`.") + if digest is None: + digest = hashlib.sha256("\n".join(digest_data).encode("utf-8")).hexdigest() + + if "cached-task-prefix" in config.graph_config["taskgraph"]: + cache_prefix = config.graph_config["taskgraph"]["cached-task-prefix"] + else: + cache_prefix = config.graph_config["trust-domain"] + + subs = { + "cache_prefix": cache_prefix, + "type": cache_type, + "name": cache_name, + "digest": digest, + } + + # We'll try to find a cached version of the toolchain at levels above and + # including the current level, starting at the highest level. + # Chain-of-trust doesn't handle tasks not built on the tip of a + # pull-request, so don't look for level-1 tasks if building a pull-request. + index_routes = [] + min_level = int(config.params["level"]) + if config.params["tasks_for"] == "github-pull-request": + min_level = max(min_level, 3) + for level in reversed(range(min_level, 4)): + subs["level"] = level + index_routes.append(TARGET_CACHE_INDEX.format(**subs)) + + taskdesc["optimization"] = {"index-search": index_routes} + + # ... and cache at the lowest level. + subs["level"] = config.params["level"] + taskdesc.setdefault("routes", []).append( + f"index.{TARGET_CACHE_INDEX.format(**subs)}" + ) + + # ... 
and add some extra routes for humans + subs["build_date_long"] = time.strftime( + "%Y.%m.%d.%Y%m%d%H%M%S", time.gmtime(config.params["build_date"]) + ) + taskdesc["routes"].extend( + [f"index.{route.format(**subs)}" for route in EXTRA_CACHE_INDEXES] + ) + + taskdesc["attributes"]["cached_task"] = { + "type": cache_type, + "name": cache_name, + "digest": digest, + } diff --git a/third_party/python/taskcluster_taskgraph/taskgraph/util/decision.py b/third_party/python/taskcluster_taskgraph/taskgraph/util/decision.py new file mode 100644 index 0000000000..d0e1e1079f --- /dev/null +++ b/third_party/python/taskcluster_taskgraph/taskgraph/util/decision.py @@ -0,0 +1,79 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. + +""" +Utilities for generating a decision task from :file:`.taskcluster.yml`. +""" + + +import os + +import jsone +import slugid +import yaml + +from .templates import merge +from .time import current_json_time +from .vcs import find_hg_revision_push_info + + +def make_decision_task(params, root, context, head_rev=None): + """Generate a basic decision task, based on the root .taskcluster.yml""" + with open(os.path.join(root, ".taskcluster.yml"), "rb") as f: + taskcluster_yml = yaml.safe_load(f) + + if not head_rev: + head_rev = params["head_rev"] + + if params["repository_type"] == "hg": + pushlog = find_hg_revision_push_info(params["repository_url"], head_rev) + + hg_push_context = { + "pushlog_id": pushlog["pushid"], + "pushdate": pushlog["pushdate"], + "owner": pushlog["user"], + } + else: + hg_push_context = {} + + slugids = {} + + def as_slugid(name): + # https://github.com/taskcluster/json-e/issues/164 + name = name[0] + if name not in slugids: + slugids[name] = slugid.nice() + return slugids[name] + + # provide a similar JSON-e context to what mozilla-taskcluster provides: + # https://docs.taskcluster.net/reference/integrations/mozilla-taskcluster/docs/taskcluster-yml + # but with a different tasks_for and an extra `cron` section + context = merge( + { + "repository": { + "url": params["repository_url"], + "project": params["project"], + "level": params["level"], + }, + "push": merge( + { + "revision": params["head_rev"], + # remainder are fake values, but the decision task expects them anyway + "comment": " ", + }, + hg_push_context, + ), + "now": current_json_time(), + "as_slugid": as_slugid, + }, + context, + ) + + rendered = jsone.render(taskcluster_yml, context) + if len(rendered["tasks"]) != 1: + raise Exception("Expected .taskcluster.yml to only produce one cron task") + task = rendered["tasks"][0] + + task_id = task.pop("taskId") + return (task_id, task) diff --git a/third_party/python/taskcluster_taskgraph/taskgraph/util/dependencies.py b/third_party/python/taskcluster_taskgraph/taskgraph/util/dependencies.py new file mode 100644 index 0000000000..d33aa3d7f2 --- /dev/null +++ b/third_party/python/taskcluster_taskgraph/taskgraph/util/dependencies.py @@ -0,0 +1,92 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. 
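An illustrative aside on taskgraph/util/decision.py above; this sketch is not part of the vendored module. make_decision_task renders the repository's .taskcluster.yml through JSON-e with a synthesized context and expects exactly one resulting task. A minimal invocation, with placeholder values rather than real configuration, might look like:

from taskgraph.util.decision import make_decision_task

# Placeholder parameters; "git" skips the hg pushlog lookup shown above.
params = {
    "repository_type": "git",
    "repository_url": "https://github.com/example/project",
    "project": "example-project",
    "level": "1",
    "head_rev": "0123abcd",
}

# Extra JSON-e context merged over the repository/push/now defaults; a cron
# caller would typically supply "tasks_for" and a "cron" section here.
task_id, task_def = make_decision_task(
    params, root=".", context={"tasks_for": "cron"}
)
# The call raises if the rendered .taskcluster.yml yields more than one task.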
+ +from typing import Dict, Iterator, Optional + +from taskgraph.task import Task +from taskgraph.transforms.base import TransformConfig +from taskgraph.util.schema import Schema + +# Define a collection of group_by functions +GROUP_BY_MAP = {} + + +def group_by(name, schema=None): + def wrapper(func): + assert ( + name not in GROUP_BY_MAP + ), f"duplicate group_by function name {name} ({func} and {GROUP_BY_MAP[name]})" + GROUP_BY_MAP[name] = func + func.schema = schema + return func + + return wrapper + + +@group_by("single") +def group_by_single(config, tasks): + for task in tasks: + yield [task] + + +@group_by("all") +def group_by_all(config, tasks): + return [[task for task in tasks]] + + +@group_by("attribute", schema=Schema(str)) +def group_by_attribute(config, tasks, attr): + groups = {} + for task in tasks: + val = task.attributes.get(attr) + if not val: + continue + groups.setdefault(val, []).append(task) + + return groups.values() + + +def get_dependencies(config: TransformConfig, task: Dict) -> Iterator[Task]: + """Iterate over all dependencies as ``Task`` objects. + + Args: + config (TransformConfig): The ``TransformConfig`` object associated + with the kind. + task (Dict): The task dictionary to retrieve dependencies from. + + Returns: + Iterator[Task]: Returns a generator that iterates over the ``Task`` + objects associated with each dependency. + """ + if "dependencies" not in task: + return [] + + for label, dep in config.kind_dependencies_tasks.items(): + if label in task["dependencies"].values(): + yield dep + + +def get_primary_dependency(config: TransformConfig, task: Dict) -> Optional[Task]: + """Return the ``Task`` object associated with the primary dependency. + + This uses the task's ``primary-kind-dependency`` attribute to find the primary + dependency, or returns ``None`` if the attribute is unset. + + Args: + config (TransformConfig): The ``TransformConfig`` object associated + with the kind. + task (Dict): The task dictionary to retrieve the primary dependency from. + + Returns: + Optional[Task]: The ``Task`` object associated with the + primary dependency or ``None``. + """ + try: + primary_kind = task["attributes"]["primary-kind-dependency"] + except KeyError: + return None + + for dep in get_dependencies(config, task): + if dep.kind == primary_kind: + return dep diff --git a/third_party/python/taskcluster_taskgraph/taskgraph/util/docker.py b/third_party/python/taskcluster_taskgraph/taskgraph/util/docker.py new file mode 100644 index 0000000000..c37a69f98f --- /dev/null +++ b/third_party/python/taskcluster_taskgraph/taskgraph/util/docker.py @@ -0,0 +1,237 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. + + +import hashlib +import io +import os +import re + +from taskgraph.util.archive import create_tar_gz_from_files +from taskgraph.util.memoize import memoize + +IMAGE_DIR = os.path.join(".", "taskcluster", "docker") + +from .yaml import load_yaml + + +def docker_image(name, by_tag=False): + """ + Resolve in-tree prebuilt docker image to ``<registry>/<repository>@sha256:<digest>``, + or ``<registry>/<repository>:<tag>`` if `by_tag` is `True`. 
+ """ + try: + with open(os.path.join(IMAGE_DIR, name, "REGISTRY")) as f: + registry = f.read().strip() + except OSError: + with open(os.path.join(IMAGE_DIR, "REGISTRY")) as f: + registry = f.read().strip() + + if not by_tag: + hashfile = os.path.join(IMAGE_DIR, name, "HASH") + try: + with open(hashfile) as f: + return f"{registry}/{name}@{f.read().strip()}" + except OSError: + raise Exception(f"Failed to read HASH file {hashfile}") + + try: + with open(os.path.join(IMAGE_DIR, name, "VERSION")) as f: + tag = f.read().strip() + except OSError: + tag = "latest" + return f"{registry}/{name}:{tag}" + + +class VoidWriter: + """A file object with write capabilities that does nothing with the written + data.""" + + def write(self, buf): + pass + + +def generate_context_hash(topsrcdir, image_path, args=None): + """Generates a sha256 hash for context directory used to build an image.""" + + return stream_context_tar(topsrcdir, image_path, VoidWriter(), args=args) + + +class HashingWriter: + """A file object with write capabilities that hashes the written data at + the same time it passes down to a real file object.""" + + def __init__(self, writer): + self._hash = hashlib.sha256() + self._writer = writer + + def write(self, buf): + self._hash.update(buf) + self._writer.write(buf) + + def hexdigest(self): + return self._hash.hexdigest() + + +def create_context_tar(topsrcdir, context_dir, out_path, args=None): + """Create a context tarball. + + A directory ``context_dir`` containing a Dockerfile will be assembled into + a gzipped tar file at ``out_path``. + + We also scan the source Dockerfile for special syntax that influences + context generation. + + If a line in the Dockerfile has the form ``# %include <path>``, + the relative path specified on that line will be matched against + files in the source repository and added to the context under the + path ``topsrcdir/``. If an entry is a directory, we add all files + under that directory. + + If a line in the Dockerfile has the form ``# %ARG <name>``, occurrences of + the string ``$<name>`` in subsequent lines are replaced with the value + found in the ``args`` argument. Exception: this doesn't apply to VOLUME + definitions. + + Returns the SHA-256 hex digest of the created archive. 
+ """ + with open(out_path, "wb") as fh: + return stream_context_tar( + topsrcdir, + context_dir, + fh, + image_name=os.path.basename(out_path), + args=args, + ) + + +RUN_TASK_ROOT = os.path.join(os.path.dirname(os.path.dirname(__file__)), "run-task") +RUN_TASK_FILES = { + f"run-task/{path}": os.path.join(RUN_TASK_ROOT, path) + for path in [ + "run-task", + "fetch-content", + "hgrc", + "robustcheckout.py", + ] +} +RUN_TASK_SNIPPET = [ + "COPY run-task/run-task /usr/local/bin/run-task\n", + "COPY run-task/fetch-content /usr/local/bin/fetch-content\n", + "COPY run-task/robustcheckout.py /usr/local/mercurial/robustcheckout.py\n" + "COPY run-task/hgrc /etc/mercurial/hgrc.d/mozilla.rc\n", +] + + +def stream_context_tar(topsrcdir, context_dir, out_file, image_name=None, args=None): + """Like create_context_tar, but streams the tar file to the `out_file` file + object.""" + archive_files = {} + replace = [] + content = [] + + topsrcdir = os.path.abspath(topsrcdir) + context_dir = os.path.join(topsrcdir, context_dir) + + for root, dirs, files in os.walk(context_dir): + for f in files: + source_path = os.path.join(root, f) + archive_path = source_path[len(context_dir) + 1 :] + archive_files[archive_path] = open(source_path, "rb") + + # Parse Dockerfile for special syntax of extra files to include. + content = [] + with open(os.path.join(context_dir, "Dockerfile")) as fh: + for line in fh: + if line.startswith("# %ARG"): + p = line[len("# %ARG ") :].strip() + if not args or p not in args: + raise Exception(f"missing argument: {p}") + replace.append((re.compile(rf"\${p}\b"), args[p])) + continue + + for regexp, s in replace: + line = re.sub(regexp, s, line) + + content.append(line) + + if not line.startswith("# %include"): + continue + + if line.strip() == "# %include-run-task": + content.extend(RUN_TASK_SNIPPET) + archive_files.update(RUN_TASK_FILES) + continue + + p = line[len("# %include ") :].strip() + if os.path.isabs(p): + raise Exception("extra include path cannot be absolute: %s" % p) + + fs_path = os.path.normpath(os.path.join(topsrcdir, p)) + # Check for filesystem traversal exploits. 
+ if not fs_path.startswith(topsrcdir): + raise Exception("extra include path outside topsrcdir: %s" % p) + + if not os.path.exists(fs_path): + raise Exception("extra include path does not exist: %s" % p) + + if os.path.isdir(fs_path): + for root, dirs, files in os.walk(fs_path): + for f in files: + source_path = os.path.join(root, f) + rel = source_path[len(fs_path) + 1 :] + archive_path = os.path.join("topsrcdir", p, rel) + archive_files[archive_path] = source_path + else: + archive_path = os.path.join("topsrcdir", p) + archive_files[archive_path] = fs_path + + archive_files["Dockerfile"] = io.BytesIO("".join(content).encode("utf-8")) + + writer = HashingWriter(out_file) + create_tar_gz_from_files(writer, archive_files, image_name) + return writer.hexdigest() + + +@memoize +def image_paths(): + """Return a map of image name to paths containing their Dockerfile.""" + config = load_yaml("taskcluster", "ci", "docker-image", "kind.yml") + return { + k: os.path.join(IMAGE_DIR, v.get("definition", k)) + for k, v in config["tasks"].items() + } + + +def image_path(name): + paths = image_paths() + if name in paths: + return paths[name] + return os.path.join(IMAGE_DIR, name) + + +@memoize +def parse_volumes(image): + """Parse VOLUME entries from a Dockerfile for an image.""" + volumes = set() + + path = image_path(image) + + with open(os.path.join(path, "Dockerfile"), "rb") as fh: + for line in fh: + line = line.strip() + # We assume VOLUME definitions don't use %ARGS. + if not line.startswith(b"VOLUME "): + continue + + v = line.split(None, 1)[1] + if v.startswith(b"["): + raise ValueError( + "cannot parse array syntax for VOLUME; " + "convert to multiple entries" + ) + + volumes |= {volume.decode("utf-8") for volume in v.split()} + + return volumes diff --git a/third_party/python/taskcluster_taskgraph/taskgraph/util/hash.py b/third_party/python/taskcluster_taskgraph/taskgraph/util/hash.py new file mode 100644 index 0000000000..5d884fc318 --- /dev/null +++ b/third_party/python/taskcluster_taskgraph/taskgraph/util/hash.py @@ -0,0 +1,58 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. + +import hashlib +from pathlib import Path + +from taskgraph.util import path as mozpath +from taskgraph.util.memoize import memoize + + +@memoize +def hash_path(path): + """Hash a single file. + + Returns the SHA-256 hash in hex form. + """ + with open(path, "rb") as fh: + return hashlib.sha256(fh.read()).hexdigest() + + +def hash_paths(base_path, patterns): + """ + Give a list of path patterns, return a digest of the contents of all + the corresponding files, similarly to git tree objects or mercurial + manifests. + + Each file is hashed. The list of all hashes and file paths is then + itself hashed to produce the result. 
+ """ + h = hashlib.sha256() + + found = set() + for pattern in patterns: + matches = _find_matching_files(base_path, pattern) + if matches: + found.update(matches) + else: + raise Exception("%s did not match anything" % pattern) + for path in sorted(found): + h.update( + "{} {}\n".format( + hash_path(mozpath.abspath(mozpath.join(base_path, path))), + mozpath.normsep(path), + ).encode("utf-8") + ) + return h.hexdigest() + + +@memoize +def _find_matching_files(base_path, pattern): + files = _get_all_files(base_path) + return [path for path in files if mozpath.match(path, pattern)] + + +@memoize +def _get_all_files(base_path): + return [str(path) for path in Path(base_path).rglob("*") if path.is_file()] diff --git a/third_party/python/taskcluster_taskgraph/taskgraph/util/keyed_by.py b/third_party/python/taskcluster_taskgraph/taskgraph/util/keyed_by.py new file mode 100644 index 0000000000..9b0c5a44fb --- /dev/null +++ b/third_party/python/taskcluster_taskgraph/taskgraph/util/keyed_by.py @@ -0,0 +1,97 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. + + +from .attributes import keymatch + + +def evaluate_keyed_by( + value, item_name, attributes, defer=None, enforce_single_match=True +): + """ + For values which can either accept a literal value, or be keyed by some + attributes, perform that lookup and return the result. + + For example, given item:: + + by-test-platform: + macosx-10.11/debug: 13 + win.*: 6 + default: 12 + + a call to `evaluate_keyed_by(item, 'thing-name', {'test-platform': 'linux96')` + would return `12`. + + Items can be nested as deeply as desired:: + + by-test-platform: + win.*: + by-project: + ash: .. + cedar: .. + linux: 13 + default: 12 + + Args: + value (str): Name of the value to perform evaluation on. + item_name (str): Used to generate useful error messages. + attributes (dict): Dictionary of attributes used to lookup 'by-<key>' with. + defer (list): + Allows evaluating a by-* entry at a later time. In the example + above it's possible that the project attribute hasn't been set yet, + in which case we'd want to stop before resolving that subkey and + then call this function again later. This can be accomplished by + setting `defer=["project"]` in this example. + enforce_single_match (bool): + If True (default), each task may only match a single arm of the + evaluation. + """ + while True: + if not isinstance(value, dict) or len(value) != 1: + return value + value_key = next(iter(value)) + if not value_key.startswith("by-"): + return value + + keyed_by = value_key[3:] # strip off 'by-' prefix + + if defer and keyed_by in defer: + return value + + key = attributes.get(keyed_by) + alternatives = next(iter(value.values())) + + if len(alternatives) == 1 and "default" in alternatives: + # Error out when only 'default' is specified as only alternatives, + # because we don't need to by-{keyed_by} there. 
+ raise Exception( + "Keyed-by '{}' unnecessary with only value 'default' " + "found, when determining item {}".format(keyed_by, item_name) + ) + + if key is None: + if "default" in alternatives: + value = alternatives["default"] + continue + else: + raise Exception( + "No attribute {} and no value for 'default' found " + "while determining item {}".format(keyed_by, item_name) + ) + + matches = keymatch(alternatives, key) + if enforce_single_match and len(matches) > 1: + raise Exception( + "Multiple matching values for {} {!r} found while " + "determining item {}".format(keyed_by, key, item_name) + ) + elif matches: + value = matches[0] + continue + + raise Exception( + "No {} matching {!r} nor 'default' found while determining item {}".format( + keyed_by, key, item_name + ) + ) diff --git a/third_party/python/taskcluster_taskgraph/taskgraph/util/memoize.py b/third_party/python/taskcluster_taskgraph/taskgraph/util/memoize.py new file mode 100644 index 0000000000..56b513e74c --- /dev/null +++ b/third_party/python/taskcluster_taskgraph/taskgraph/util/memoize.py @@ -0,0 +1,40 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this file, +# You can obtain one at http://mozilla.org/MPL/2.0/. + +# Imported from +# https://searchfox.org/mozilla-central/rev/c3ebaf6de2d481c262c04bb9657eaf76bf47e2ac/python/mozbuild/mozbuild/util.py#923-949 + + +import functools + + +class memoize(dict): + """A decorator to memoize the results of function calls depending + on its arguments. + Both functions and instance methods are handled, although in the + instance method case, the results are cache in the instance itself. + """ + + def __init__(self, func): + self.func = func + functools.update_wrapper(self, func) + + def __call__(self, *args): + if args not in self: + self[args] = self.func(*args) + return self[args] + + def method_call(self, instance, *args): + name = "_%s" % self.func.__name__ + if not hasattr(instance, name): + setattr(instance, name, {}) + cache = getattr(instance, name) + if args not in cache: + cache[args] = self.func(instance, *args) + return cache[args] + + def __get__(self, instance, cls): + return functools.update_wrapper( + functools.partial(self.method_call, instance), self.func + ) diff --git a/third_party/python/taskcluster_taskgraph/taskgraph/util/parameterization.py b/third_party/python/taskcluster_taskgraph/taskgraph/util/parameterization.py new file mode 100644 index 0000000000..6233a98a40 --- /dev/null +++ b/third_party/python/taskcluster_taskgraph/taskgraph/util/parameterization.py @@ -0,0 +1,97 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. 
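An illustrative aside on taskgraph/util/memoize.py above; this sketch is not part of the vendored module. The decorator caches results keyed by the (hashable) positional arguments, and the __get__/method_call pair extends the same behavior to instance methods, storing a per-instance cache in an attribute named after the function:

from taskgraph.util.memoize import memoize

@memoize
def fib(n):
    # Recursive calls go through the decorated name, so each distinct n
    # is computed at most once and then served from the memoize dict.
    return n if n < 2 else fib(n - 1) + fib(n - 2)

fib(100)  # computed once per subproblem
fib(100)  # plain dict lookup on the second call


class Checkout:
    @memoize
    def head_rev(self, branch):
        # Cached per instance, under self._head_rev[(branch,)].
        return run_vcs_query(branch)  # run_vcs_query is a hypothetical helper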
+ + +import re + +from taskgraph.util.taskcluster import get_artifact_url +from taskgraph.util.time import json_time_from_now + +TASK_REFERENCE_PATTERN = re.compile("<([^>]+)>") +ARTIFACT_REFERENCE_PATTERN = re.compile("<([^/]+)/([^>]+)>") + + +def _recurse(val, param_fns): + def recurse(val): + if isinstance(val, list): + return [recurse(v) for v in val] + elif isinstance(val, dict): + if len(val) == 1: + for param_key, param_fn in param_fns.items(): + if set(val.keys()) == {param_key}: + return param_fn(val[param_key]) + return {k: recurse(v) for k, v in val.items()} + else: + return val + + return recurse(val) + + +def resolve_timestamps(now, task_def): + """Resolve all instances of `{'relative-datestamp': '..'}` in the given task definition""" + return _recurse( + task_def, + { + "relative-datestamp": lambda v: json_time_from_now(v, now), + }, + ) + + +def resolve_task_references(label, task_def, task_id, decision_task_id, dependencies): + """Resolve all instances of ``{'task-reference': '..<..>..'} `` + and ``{'artifact-reference`: '..<dependency/artifact/path>..'}`` + in the given task definition, using the given dependencies. + """ + + def task_reference(val): + def repl(match): + key = match.group(1) + if key == "self": + return task_id + elif key == "decision": + return decision_task_id + try: + return dependencies[key] + except KeyError: + # handle escaping '<' + if key == "<": + return key + raise KeyError(f"task '{label}' has no dependency named '{key}'") + + return TASK_REFERENCE_PATTERN.sub(repl, val) + + def artifact_reference(val): + def repl(match): + dependency, artifact_name = match.group(1, 2) + + if dependency == "self": + raise KeyError(f"task '{label}' can't reference artifacts of self") + elif dependency == "decision": + task_id = decision_task_id + else: + try: + task_id = dependencies[dependency] + except KeyError: + raise KeyError( + "task '{}' has no dependency named '{}'".format( + label, dependency + ) + ) + + assert artifact_name.startswith( + "public/" + ), "artifact-reference only supports public artifacts, not `{}`".format( + artifact_name + ) + return get_artifact_url(task_id, artifact_name) + + return ARTIFACT_REFERENCE_PATTERN.sub(repl, val) + + return _recurse( + task_def, + { + "task-reference": task_reference, + "artifact-reference": artifact_reference, + }, + ) diff --git a/third_party/python/taskcluster_taskgraph/taskgraph/util/path.py b/third_party/python/taskcluster_taskgraph/taskgraph/util/path.py new file mode 100644 index 0000000000..c725140b12 --- /dev/null +++ b/third_party/python/taskcluster_taskgraph/taskgraph/util/path.py @@ -0,0 +1,167 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. + +""" +Like :py:mod:`os.path`, with a reduced set of functions, and with normalized path +separators (always use forward slashes). +Also contains a few additional utilities not found in :py:mod:`os.path`. +""" + +# Imported from +# https://searchfox.org/mozilla-central/rev/c3ebaf6de2d481c262c04bb9657eaf76bf47e2ac/python/mozbuild/mozpack/path.py + + +import os +import posixpath +import re + + +def normsep(path): + """ + Normalize path separators, by using forward slashes instead of whatever + :py:const:`os.sep` is. 
+ """ + if os.sep != "/": + path = path.replace(os.sep, "/") + if os.altsep and os.altsep != "/": + path = path.replace(os.altsep, "/") + return path + + +def relpath(path, start): + rel = normsep(os.path.relpath(path, start)) + return "" if rel == "." else rel + + +def realpath(path): + return normsep(os.path.realpath(path)) + + +def abspath(path): + return normsep(os.path.abspath(path)) + + +def join(*paths): + return normsep(os.path.join(*paths)) + + +def normpath(path): + return posixpath.normpath(normsep(path)) + + +def dirname(path): + return posixpath.dirname(normsep(path)) + + +def commonprefix(paths): + return posixpath.commonprefix([normsep(path) for path in paths]) + + +def basename(path): + return os.path.basename(path) + + +def splitext(path): + return posixpath.splitext(normsep(path)) + + +def split(path): + """ + Return the normalized path as a list of its components. + + ``split('foo/bar/baz')`` returns ``['foo', 'bar', 'baz']`` + """ + return normsep(path).split("/") + + +def basedir(path, bases): + """ + Given a list of directories (`bases`), return which one contains the given + path. If several matches are found, the deepest base directory is returned. + + ``basedir('foo/bar/baz', ['foo', 'baz', 'foo/bar'])`` returns ``'foo/bar'`` + (`'foo'` and `'foo/bar'` both match, but `'foo/bar'` is the deepest match) + """ + path = normsep(path) + bases = [normsep(b) for b in bases] + if path in bases: + return path + for b in sorted(bases, reverse=True): + if not b or path.startswith(b + "/"): + return b + + +re_cache = {} +MATCH_STAR_STAR_RE = re.compile(r"(^|/)\\\*\\\*/") +MATCH_STAR_STAR_END_RE = re.compile(r"(^|/)\\\*\\\*$") + + +def match(path, pattern): + """ + Return whether the given path matches the given pattern. + An asterisk can be used to match any string, including the null string, in + one part of the path: + + ``foo`` matches ``*``, ``f*`` or ``fo*o`` + + However, an asterisk matching a subdirectory may not match the null string: + + ``foo/bar`` does *not* match ``foo/*/bar`` + + If the pattern matches one of the ancestor directories of the path, the + patch is considered matching: + + ``foo/bar`` matches ``foo`` + + Two adjacent asterisks can be used to match files and zero or more + directories and subdirectories. + + ``foo/bar`` matches ``foo/**/bar``, or ``**/bar`` + """ + if not pattern: + return True + if pattern not in re_cache: + p = re.escape(pattern) + p = MATCH_STAR_STAR_RE.sub(r"\1(?:.+/)?", p) + p = MATCH_STAR_STAR_END_RE.sub(r"(?:\1.+)?", p) + p = p.replace(r"\*", "[^/]*") + "(?:/.*)?$" + re_cache[pattern] = re.compile(p) + return re_cache[pattern].match(path) is not None + + +def rebase(oldbase, base, relativepath): + """ + Return `relativepath` relative to `base` instead of `oldbase`. + """ + if base == oldbase: + return relativepath + if len(base) < len(oldbase): + assert basedir(oldbase, [base]) == base + relbase = relpath(oldbase, base) + result = join(relbase, relativepath) + else: + assert basedir(base, [oldbase]) == oldbase + relbase = relpath(base, oldbase) + result = relpath(relativepath, relbase) + result = normpath(result) + if relativepath.endswith("/") and not result.endswith("/"): + result += "/" + return result + + +def ancestors(path): + """Emit the parent directories of a path. + + Args: + path (str): Path to emit parents of. + + Yields: + str: Path of parent directory. 
+ """ + while path: + yield path + newpath = os.path.dirname(path) + if newpath == path: + break + path = newpath diff --git a/third_party/python/taskcluster_taskgraph/taskgraph/util/python_path.py b/third_party/python/taskcluster_taskgraph/taskgraph/util/python_path.py new file mode 100644 index 0000000000..3eb61dfbf3 --- /dev/null +++ b/third_party/python/taskcluster_taskgraph/taskgraph/util/python_path.py @@ -0,0 +1,52 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. + +import inspect +import os + + +def find_object(path): + """ + Find a Python object given a path of the form <modulepath>:<objectpath>. + Conceptually equivalent to + + def find_object(modulepath, objectpath): + import <modulepath> as mod + return mod.<objectpath> + """ + if path.count(":") != 1: + raise ValueError(f'python path {path!r} does not have the form "module:object"') + + modulepath, objectpath = path.split(":") + obj = __import__(modulepath) + for a in modulepath.split(".")[1:]: + obj = getattr(obj, a) + for a in objectpath.split("."): + obj = getattr(obj, a) + return obj + + +def import_sibling_modules(exceptions=None): + """ + Import all Python modules that are siblings of the calling module. + + Args: + exceptions (list): A list of file names to exclude (caller and + __init__.py are implicitly excluded). + """ + frame = inspect.stack()[1] + mod = inspect.getmodule(frame[0]) + + name = os.path.basename(mod.__file__) + excs = {"__init__.py", name} + if exceptions: + excs.update(exceptions) + + modpath = mod.__name__ + if not name.startswith("__init__.py"): + modpath = modpath.rsplit(".", 1)[0] + + for f in os.listdir(os.path.dirname(mod.__file__)): + if f.endswith(".py") and f not in excs: + __import__(modpath + "." + f[:-3]) diff --git a/third_party/python/taskcluster_taskgraph/taskgraph/util/readonlydict.py b/third_party/python/taskcluster_taskgraph/taskgraph/util/readonlydict.py new file mode 100644 index 0000000000..55d74f479a --- /dev/null +++ b/third_party/python/taskcluster_taskgraph/taskgraph/util/readonlydict.py @@ -0,0 +1,22 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this file, +# You can obtain one at http://mozilla.org/MPL/2.0/. + +# Imported from +# https://searchfox.org/mozilla-central/rev/c3ebaf6de2d481c262c04bb9657eaf76bf47e2ac/python/mozbuild/mozbuild/util.py#115-127 + + +class ReadOnlyDict(dict): + """A read-only dictionary.""" + + def __init__(self, *args, **kwargs): + dict.__init__(self, *args, **kwargs) + + def __delitem__(self, key): + raise Exception("Object does not support deletion.") + + def __setitem__(self, key, value): + raise Exception("Object does not support assignment.") + + def update(self, *args, **kwargs): + raise Exception("Object does not support update.") diff --git a/third_party/python/taskcluster_taskgraph/taskgraph/util/schema.py b/third_party/python/taskcluster_taskgraph/taskgraph/util/schema.py new file mode 100644 index 0000000000..3989f71182 --- /dev/null +++ b/third_party/python/taskcluster_taskgraph/taskgraph/util/schema.py @@ -0,0 +1,260 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. 
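A short usage sketch for find_object from taskgraph/util/python_path.py above. The "module:object" targets here are standard-library examples chosen purely for illustration, not values used by taskgraph.

    from taskgraph.util.python_path import find_object

    # The part before ":" is imported as a module; the part after ":" is
    # resolved attribute-by-attribute, so dotted object paths also work.
    dumps = find_object("json:dumps")
    assert dumps({"a": 1}) == '{"a": 1}'

    decoder_cls = find_object("json:JSONDecoder")
    assert decoder_cls().decode("[1, 2]") == [1, 2]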
+ + +import collections +import pprint +import re + +import voluptuous + +import taskgraph + +from .keyed_by import evaluate_keyed_by + + +def validate_schema(schema, obj, msg_prefix): + """ + Validate that object satisfies schema. If not, generate a useful exception + beginning with msg_prefix. + """ + if taskgraph.fast: + return + try: + schema(obj) + except voluptuous.MultipleInvalid as exc: + msg = [msg_prefix] + for error in exc.errors: + msg.append(str(error)) + raise Exception("\n".join(msg) + "\n" + pprint.pformat(obj)) + + +def optionally_keyed_by(*arguments): + """ + Mark a schema value as optionally keyed by any of a number of fields. The + schema is the last argument, and the remaining fields are taken to be the + field names. For example: + + 'some-value': optionally_keyed_by( + 'test-platform', 'build-platform', + Any('a', 'b', 'c')) + + The resulting schema will allow nesting of `by-test-platform` and + `by-build-platform` in either order. + """ + schema = arguments[-1] + fields = arguments[:-1] + + def validator(obj): + if isinstance(obj, dict) and len(obj) == 1: + k, v = list(obj.items())[0] + if k.startswith("by-") and k[len("by-") :] in fields: + res = {} + for kk, vv in v.items(): + try: + res[kk] = validator(vv) + except voluptuous.Invalid as e: + e.prepend([k, kk]) + raise + return res + return Schema(schema)(obj) + + return validator + + +def resolve_keyed_by( + item, field, item_name, defer=None, enforce_single_match=True, **extra_values +): + """ + For values which can either accept a literal value, or be keyed by some + other attribute of the item, perform that lookup and replacement in-place + (modifying `item` directly). The field is specified using dotted notation + to traverse dictionaries. + + For example, given item:: + + job: + test-platform: linux128 + chunks: + by-test-platform: + macosx-10.11/debug: 13 + win.*: 6 + default: 12 + + a call to `resolve_keyed_by(item, 'job.chunks', item['thing-name'])` + would mutate item in-place to:: + + job: + test-platform: linux128 + chunks: 12 + + The `item_name` parameter is used to generate useful error messages. + + If extra_values are supplied, they represent additional values available + for reference from by-<field>. + + Items can be nested as deeply as the schema will allow:: + + chunks: + by-test-platform: + win.*: + by-project: + ash: .. + cedar: .. + linux: 13 + default: 12 + + Args: + item (dict): Object being evaluated. + field (str): Name of the key to perform evaluation on. + item_name (str): Used to generate useful error messages. + defer (list): + Allows evaluating a by-* entry at a later time. In the example + above it's possible that the project attribute hasn't been set yet, + in which case we'd want to stop before resolving that subkey and + then call this function again later. This can be accomplished by + setting `defer=["project"]` in this example. + enforce_single_match (bool): + If True (default), each task may only match a single arm of the + evaluation. + extra_values (kwargs): + If supplied, represent additional values available + for reference from by-<field>. + + Returns: + dict: item which has also been modified in-place. + """ + # find the field, returning the item unchanged if anything goes wrong + container, subfield = item, field + while "." 
in subfield: + f, subfield = subfield.split(".", 1) + if f not in container: + return item + container = container[f] + if not isinstance(container, dict): + return item + + if subfield not in container: + return item + + container[subfield] = evaluate_keyed_by( + value=container[subfield], + item_name=f"`{field}` in `{item_name}`", + defer=defer, + enforce_single_match=enforce_single_match, + attributes=dict(item, **extra_values), + ) + + return item + + +# Schemas for YAML files should use dashed identifiers by default. If there are +# components of the schema for which there is a good reason to use another format, +# they can be excepted here. +EXCEPTED_SCHEMA_IDENTIFIERS = [ + # upstream-artifacts and artifact-map are handed directly to scriptWorker, + # which expects interCaps + "upstream-artifacts", + "artifact-map", +] + + +def check_schema(schema): + identifier_re = re.compile(r"^\$?[a-z][a-z0-9-]*$") + + def excepted(item): + for esi in EXCEPTED_SCHEMA_IDENTIFIERS: + if isinstance(esi, str): + if f"[{esi!r}]" in item: + return True + elif esi(item): + return True + return False + + def iter(path, sch): + def check_identifier(path, k): + if k in (str,) or k in (str, voluptuous.Extra): + pass + elif isinstance(k, voluptuous.NotIn): + pass + elif isinstance(k, str): + if not identifier_re.match(k) and not excepted(path): + raise RuntimeError( + "YAML schemas should use dashed lower-case identifiers, " + "not {!r} @ {}".format(k, path) + ) + elif isinstance(k, (voluptuous.Optional, voluptuous.Required)): + check_identifier(path, k.schema) + elif isinstance(k, (voluptuous.Any, voluptuous.All)): + for v in k.validators: + check_identifier(path, v) + elif not excepted(path): + raise RuntimeError( + "Unexpected type in YAML schema: {} @ {}".format( + type(k).__name__, path + ) + ) + + if isinstance(sch, collections.abc.Mapping): + for k, v in sch.items(): + child = f"{path}[{k!r}]" + check_identifier(child, k) + iter(child, v) + elif isinstance(sch, (list, tuple)): + for i, v in enumerate(sch): + iter(f"{path}[{i}]", v) + elif isinstance(sch, voluptuous.Any): + for v in sch.validators: + iter(path, v) + + iter("schema", schema.schema) + + +class Schema(voluptuous.Schema): + """ + Operates identically to voluptuous.Schema, but applying some taskgraph-specific checks + in the process. + """ + + def __init__(self, *args, check=True, **kwargs): + super().__init__(*args, **kwargs) + + self.check = check + if not taskgraph.fast and self.check: + check_schema(self) + + def extend(self, *args, **kwargs): + schema = super().extend(*args, **kwargs) + + if self.check: + check_schema(schema) + # We want twice extend schema to be checked too. 
+ schema.__class__ = Schema + return schema + + def _compile(self, schema): + if taskgraph.fast: + return + return super()._compile(schema) + + def __getitem__(self, item): + return self.schema[item] + + +OptimizationSchema = voluptuous.Any( + # always run this task (default) + None, + # search the index for the given index namespaces, and replace this task if found + # the search occurs in order, with the first match winning + {"index-search": [str]}, + # skip this task if none of the given file patterns match + {"skip-unless-changed": [str]}, +) + +# shortcut for a string where task references are allowed +taskref_or_string = voluptuous.Any( + str, + {voluptuous.Required("task-reference"): str}, + {voluptuous.Required("artifact-reference"): str}, +) diff --git a/third_party/python/taskcluster_taskgraph/taskgraph/util/shell.py b/third_party/python/taskcluster_taskgraph/taskgraph/util/shell.py new file mode 100644 index 0000000000..d695767f05 --- /dev/null +++ b/third_party/python/taskcluster_taskgraph/taskgraph/util/shell.py @@ -0,0 +1,40 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this file, +# You can obtain one at http://mozilla.org/MPL/2.0/. + +import re + +SHELL_QUOTE_RE = re.compile(r"[\\\t\r\n \'\"#<>&|`(){}$;\*\?]") + + +def _quote(s): + """Given a string, returns a version that can be used literally on a shell + command line, enclosing it with single quotes if necessary. + + As a special case, if given an int, returns a string containing the int, + not enclosed in quotes. + """ + if type(s) == int: + return "%d" % s + + # Empty strings need to be quoted to have any significance + if s and not SHELL_QUOTE_RE.search(s) and not s.startswith("~"): + return s + + # Single quoted strings can contain any characters unescaped except the + # single quote itself, which can't even be escaped, so the string needs to + # be closed, an escaped single quote added, and reopened. + t = type(s) + return t("'%s'") % s.replace(t("'"), t("'\\''")) + + +def quote(*strings): + """Given one or more strings, returns a quoted string that can be used + literally on a shell command line. + + >>> quote('a', 'b') + "a b" + >>> quote('a b', 'c') + "'a b' c" + """ + return " ".join(_quote(s) for s in strings) diff --git a/third_party/python/taskcluster_taskgraph/taskgraph/util/taskcluster.py b/third_party/python/taskcluster_taskgraph/taskgraph/util/taskcluster.py new file mode 100644 index 0000000000..a830a473b3 --- /dev/null +++ b/third_party/python/taskcluster_taskgraph/taskgraph/util/taskcluster.py @@ -0,0 +1,373 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. + + +import datetime +import functools +import logging +import os + +import requests +import taskcluster_urls as liburls +from requests.packages.urllib3.util.retry import Retry + +from taskgraph.task import Task +from taskgraph.util import yaml +from taskgraph.util.memoize import memoize + +logger = logging.getLogger(__name__) + +# this is set to true for `mach taskgraph action-callback --test` +testing = False + +# Default rootUrl to use if none is given in the environment; this should point +# to the production Taskcluster deployment used for CI. 
+PRODUCTION_TASKCLUSTER_ROOT_URL = None + +# the maximum number of parallel Taskcluster API calls to make +CONCURRENCY = 50 + + +@memoize +def get_root_url(use_proxy): + """Get the current TASKCLUSTER_ROOT_URL. + + When running in a task, this must come from $TASKCLUSTER_ROOT_URL; when run + on the command line, a default may be provided that points to the + production deployment of Taskcluster. If use_proxy is set, this attempts to + get TASKCLUSTER_PROXY_URL instead, failing if it is not set. + """ + if use_proxy: + try: + return liburls.normalize_root_url(os.environ["TASKCLUSTER_PROXY_URL"]) + except KeyError: + if "TASK_ID" not in os.environ: + raise RuntimeError( + "taskcluster-proxy is not available when not executing in a task" + ) + else: + raise RuntimeError("taskcluster-proxy is not enabled for this task") + + if "TASKCLUSTER_ROOT_URL" in os.environ: + logger.debug( + "Running in Taskcluster instance {}{}".format( + os.environ["TASKCLUSTER_ROOT_URL"], + " with taskcluster-proxy" + if "TASKCLUSTER_PROXY_URL" in os.environ + else "", + ) + ) + return liburls.normalize_root_url(os.environ["TASKCLUSTER_ROOT_URL"]) + + if "TASK_ID" in os.environ: + raise RuntimeError("$TASKCLUSTER_ROOT_URL must be set when running in a task") + + if PRODUCTION_TASKCLUSTER_ROOT_URL is None: + raise RuntimeError( + "Could not detect Taskcluster instance, set $TASKCLUSTER_ROOT_URL" + ) + + logger.debug("Using default TASKCLUSTER_ROOT_URL") + return liburls.normalize_root_url(PRODUCTION_TASKCLUSTER_ROOT_URL) + + +def requests_retry_session( + retries, + backoff_factor=0.1, + status_forcelist=(500, 502, 503, 504), + concurrency=CONCURRENCY, + session=None, +): + session = session or requests.Session() + retry = Retry( + total=retries, + read=retries, + connect=retries, + backoff_factor=backoff_factor, + status_forcelist=status_forcelist, + ) + + # Default HTTPAdapter uses 10 connections. Mount custom adapter to increase + # that limit. Connections are established as needed, so using a large value + # should not negatively impact performance. + http_adapter = requests.adapters.HTTPAdapter( + pool_connections=concurrency, + pool_maxsize=concurrency, + max_retries=retry, + ) + session.mount("http://", http_adapter) + session.mount("https://", http_adapter) + + return session + + +@memoize +def get_session(): + return requests_retry_session(retries=5) + + +def _do_request(url, method=None, **kwargs): + if method is None: + method = "post" if kwargs else "get" + + session = get_session() + if method == "get": + kwargs["stream"] = True + + response = getattr(session, method)(url, **kwargs) + + if response.status_code >= 400: + # Consume content before raise_for_status, so that the connection can be + # reused. + response.content + response.raise_for_status() + return response + + +def _handle_artifact(path, response): + if path.endswith(".json"): + return response.json() + if path.endswith(".yml"): + return yaml.load_stream(response.text) + response.raw.read = functools.partial(response.raw.read, decode_content=True) + return response.raw + + +def get_artifact_url(task_id, path, use_proxy=False): + artifact_tmpl = liburls.api( + get_root_url(False), "queue", "v1", "task/{}/artifacts/{}" + ) + data = artifact_tmpl.format(task_id, path) + if use_proxy: + # Until Bug 1405889 is deployed, we can't download directly + # from the taskcluster-proxy. Work around by using the /bewit + # endpoint instead. 
+ # The bewit URL is the body of a 303 redirect, which we don't + # want to follow (which fetches a potentially large resource). + response = _do_request( + os.environ["TASKCLUSTER_PROXY_URL"] + "/bewit", + data=data, + allow_redirects=False, + ) + return response.text + return data + + +def get_artifact(task_id, path, use_proxy=False): + """ + Returns the artifact with the given path for the given task id. + + If the path ends with ".json" or ".yml", the content is deserialized as, + respectively, json or yaml, and the corresponding python data (usually + dict) is returned. + For other types of content, a file-like object is returned. + """ + response = _do_request(get_artifact_url(task_id, path, use_proxy)) + return _handle_artifact(path, response) + + +def list_artifacts(task_id, use_proxy=False): + response = _do_request(get_artifact_url(task_id, "", use_proxy).rstrip("/")) + return response.json()["artifacts"] + + +def get_artifact_prefix(task): + prefix = None + if isinstance(task, dict): + prefix = task.get("attributes", {}).get("artifact_prefix") + elif isinstance(task, Task): + prefix = task.attributes.get("artifact_prefix") + else: + raise Exception(f"Can't find artifact-prefix of non-task: {task}") + return prefix or "public/build" + + +def get_artifact_path(task, path): + return f"{get_artifact_prefix(task)}/{path}" + + +def get_index_url(index_path, use_proxy=False, multiple=False): + index_tmpl = liburls.api(get_root_url(use_proxy), "index", "v1", "task{}/{}") + return index_tmpl.format("s" if multiple else "", index_path) + + +def find_task_id(index_path, use_proxy=False): + try: + response = _do_request(get_index_url(index_path, use_proxy)) + except requests.exceptions.HTTPError as e: + if e.response.status_code == 404: + raise KeyError(f"index path {index_path} not found") + raise + return response.json()["taskId"] + + +def get_artifact_from_index(index_path, artifact_path, use_proxy=False): + full_path = index_path + "/artifacts/" + artifact_path + response = _do_request(get_index_url(full_path, use_proxy)) + return _handle_artifact(full_path, response) + + +def list_tasks(index_path, use_proxy=False): + """ + Returns a list of task_ids where each task_id is indexed under a path + in the index. Results are sorted by expiration date from oldest to newest. + """ + results = [] + data = {} + while True: + response = _do_request( + get_index_url(index_path, use_proxy, multiple=True), json=data + ) + response = response.json() + results += response["tasks"] + if response.get("continuationToken"): + data = {"continuationToken": response.get("continuationToken")} + else: + break + + # We can sort on expires because in the general case + # all of these tasks should be created with the same expires time so they end up in + # order from earliest to latest action. If more correctness is needed, consider + # fetching each task and sorting on the created date. 
+ results.sort(key=lambda t: parse_time(t["expires"])) + return [t["taskId"] for t in results] + + +def parse_time(timestamp): + """Turn a "JSON timestamp" as used in TC APIs into a datetime""" + return datetime.datetime.strptime(timestamp, "%Y-%m-%dT%H:%M:%S.%fZ") + + +def get_task_url(task_id, use_proxy=False): + task_tmpl = liburls.api(get_root_url(use_proxy), "queue", "v1", "task/{}") + return task_tmpl.format(task_id) + + +def get_task_definition(task_id, use_proxy=False): + response = _do_request(get_task_url(task_id, use_proxy)) + return response.json() + + +def cancel_task(task_id, use_proxy=False): + """Cancels a task given a task_id. In testing mode, just logs that it would + have cancelled.""" + if testing: + logger.info(f"Would have cancelled {task_id}.") + else: + _do_request(get_task_url(task_id, use_proxy) + "/cancel", json={}) + + +def status_task(task_id, use_proxy=False): + """Gets the status of a task given a task_id. + + In testing mode, just logs that it would have retrieved status. + + Args: + task_id (str): A task id. + use_proxy (bool): Whether to use taskcluster-proxy (default: False) + + Returns: + dict: A dictionary object as defined here: + https://docs.taskcluster.net/docs/reference/platform/queue/api#status + """ + if testing: + logger.info(f"Would have gotten status for {task_id}.") + else: + resp = _do_request(get_task_url(task_id, use_proxy) + "/status") + status = resp.json().get("status", {}) + return status + + +def state_task(task_id, use_proxy=False): + """Gets the state of a task given a task_id. + + In testing mode, just logs that it would have retrieved state. This is a subset of the + data returned by :func:`status_task`. + + Args: + task_id (str): A task id. + use_proxy (bool): Whether to use taskcluster-proxy (default: False) + + Returns: + str: The state of the task, one of + ``pending, running, completed, failed, exception, unknown``. + """ + if testing: + logger.info(f"Would have gotten state for {task_id}.") + else: + status = status_task(task_id, use_proxy=use_proxy).get("state") or "unknown" + return status + + +def rerun_task(task_id): + """Reruns a task given a task_id. In testing mode, just logs that it would + have reran.""" + if testing: + logger.info(f"Would have rerun {task_id}.") + else: + _do_request(get_task_url(task_id, use_proxy=True) + "/rerun", json={}) + + +def get_current_scopes(): + """Get the current scopes. 
This only makes sense in a task with the Taskcluster + proxy enabled, where it returns the actual scopes accorded to the task.""" + auth_url = liburls.api(get_root_url(True), "auth", "v1", "scopes/current") + resp = _do_request(auth_url) + return resp.json().get("scopes", []) + + +def get_purge_cache_url(provisioner_id, worker_type, use_proxy=False): + url_tmpl = liburls.api( + get_root_url(use_proxy), "purge-cache", "v1", "purge-cache/{}/{}" + ) + return url_tmpl.format(provisioner_id, worker_type) + + +def purge_cache(provisioner_id, worker_type, cache_name, use_proxy=False): + """Requests a cache purge from the purge-caches service.""" + if testing: + logger.info( + "Would have purged {}/{}/{}.".format( + provisioner_id, worker_type, cache_name + ) + ) + else: + logger.info(f"Purging {provisioner_id}/{worker_type}/{cache_name}.") + purge_cache_url = get_purge_cache_url(provisioner_id, worker_type, use_proxy) + _do_request(purge_cache_url, json={"cacheName": cache_name}) + + +def send_email(address, subject, content, link, use_proxy=False): + """Sends an email using the notify service""" + logger.info(f"Sending email to {address}.") + url = liburls.api(get_root_url(use_proxy), "notify", "v1", "email") + _do_request( + url, + json={ + "address": address, + "subject": subject, + "content": content, + "link": link, + }, + ) + + +def list_task_group_incomplete_tasks(task_group_id): + """Generate the incomplete tasks in a task group""" + params = {} + while True: + url = liburls.api( + get_root_url(False), + "queue", + "v1", + f"task-group/{task_group_id}/list", + ) + resp = _do_request(url, method="get", params=params).json() + for task in [t["status"] for t in resp["tasks"]]: + if task["state"] in ["running", "pending", "unscheduled"]: + yield task["taskId"] + if resp.get("continuationToken"): + params = {"continuationToken": resp.get("continuationToken")} + else: + break diff --git a/third_party/python/taskcluster_taskgraph/taskgraph/util/taskgraph.py b/third_party/python/taskcluster_taskgraph/taskgraph/util/taskgraph.py new file mode 100644 index 0000000000..7b545595ef --- /dev/null +++ b/third_party/python/taskcluster_taskgraph/taskgraph/util/taskgraph.py @@ -0,0 +1,54 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. + +""" +Tools for interacting with existing taskgraphs. 
+""" + + +from taskgraph.util.taskcluster import find_task_id, get_artifact + + +def find_decision_task(parameters, graph_config): + """Given the parameters for this action, find the taskId of the decision + task""" + if parameters.get("repository_type", "hg") == "hg": + return find_task_id( + "{}.v2.{}.pushlog-id.{}.decision".format( + graph_config["trust-domain"], + parameters["project"], + parameters["pushlog_id"], + ) + ) + elif parameters["repository_type"] == "git": + return find_task_id( + "{}.v2.{}.revision.{}.taskgraph.decision".format( + graph_config["trust-domain"], + parameters["project"], + parameters["head_rev"], + ) + ) + else: + raise Exception( + "Unknown repository_type {}!".format(parameters["repository_type"]) + ) + + +def find_existing_tasks_from_previous_kinds( + full_task_graph, previous_graph_ids, rebuild_kinds +): + """Given a list of previous decision/action taskIds and kinds to ignore + from the previous graphs, return a dictionary of labels-to-taskids to use + as ``existing_tasks`` in the optimization step.""" + existing_tasks = {} + for previous_graph_id in previous_graph_ids: + label_to_taskid = get_artifact(previous_graph_id, "public/label-to-taskid.json") + kind_labels = { + t.label + for t in full_task_graph.tasks.values() + if t.attributes["kind"] not in rebuild_kinds + } + for label in set(label_to_taskid.keys()).intersection(kind_labels): + existing_tasks[label] = label_to_taskid[label] + return existing_tasks diff --git a/third_party/python/taskcluster_taskgraph/taskgraph/util/templates.py b/third_party/python/taskcluster_taskgraph/taskgraph/util/templates.py new file mode 100644 index 0000000000..23cd5f8d68 --- /dev/null +++ b/third_party/python/taskcluster_taskgraph/taskgraph/util/templates.py @@ -0,0 +1,80 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. + + +import copy + + +def merge_to(source, dest): + """ + Merge dict and arrays (override scalar values) + + Keys from source override keys from dest, and elements from lists in source + are appended to lists in dest. + + :param dict source: to copy from + :param dict dest: to copy to (modified in place) + """ + + for key, value in source.items(): + # Override mismatching or empty types + if type(value) != type(dest.get(key)): # noqa + dest[key] = source[key] + continue + + # Merge dict + if isinstance(value, dict): + merge_to(value, dest[key]) + continue + + if isinstance(value, list): + dest[key] = dest[key] + source[key] + continue + + dest[key] = source[key] + + return dest + + +def merge(*objects): + """ + Merge the given objects, using the semantics described for merge_to, with + objects later in the list taking precedence. From an inheritance + perspective, "parents" should be listed before "children". + + Returns the result without modifying any arguments. + """ + if len(objects) == 1: + return copy.deepcopy(objects[0]) + return merge_to(objects[-1], merge(*objects[:-1])) + + +def deep_get(dict_, field): + container, subfield = dict_, field + while "." 
in subfield: + f, subfield = subfield.split(".", 1) + if f not in container: + return None + + container = container[f] + + return container.get(subfield) + + +def substitute(item, **subs): + if isinstance(item, list): + for i in range(len(item)): + item[i] = substitute(item[i], **subs) + elif isinstance(item, dict): + new_dict = {} + for k, v in item.items(): + k = k.format(**subs) + new_dict[k] = substitute(v, **subs) + item = new_dict + elif isinstance(item, str): + item = item.format(**subs) + else: + item = item + + return item diff --git a/third_party/python/taskcluster_taskgraph/taskgraph/util/time.py b/third_party/python/taskcluster_taskgraph/taskgraph/util/time.py new file mode 100644 index 0000000000..e511978b5f --- /dev/null +++ b/third_party/python/taskcluster_taskgraph/taskgraph/util/time.py @@ -0,0 +1,115 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. + +# Python port of the ms.js node module this is not a direct port some things are +# more complicated or less precise and we lean on time delta here. + + +import datetime +import re + +PATTERN = re.compile(r"((?:\d+)?\.?\d+) *([a-z]+)") + + +def seconds(value): + return datetime.timedelta(seconds=int(value)) + + +def minutes(value): + return datetime.timedelta(minutes=int(value)) + + +def hours(value): + return datetime.timedelta(hours=int(value)) + + +def days(value): + return datetime.timedelta(days=int(value)) + + +def months(value): + # See warning in years(), below + return datetime.timedelta(days=int(value) * 30) + + +def years(value): + # Warning here "years" are vague don't use this for really sensitive date + # computation the idea is to give you a absolute amount of time in the + # future which is not the same thing as "precisely on this date next year" + return datetime.timedelta(days=int(value) * 365) + + +ALIASES = {} +ALIASES["seconds"] = ALIASES["second"] = ALIASES["s"] = seconds +ALIASES["minutes"] = ALIASES["minute"] = ALIASES["min"] = minutes +ALIASES["hours"] = ALIASES["hour"] = ALIASES["h"] = hours +ALIASES["days"] = ALIASES["day"] = ALIASES["d"] = days +ALIASES["months"] = ALIASES["month"] = ALIASES["mo"] = months +ALIASES["years"] = ALIASES["year"] = ALIASES["y"] = years + + +class InvalidString(Exception): + pass + + +class UnknownTimeMeasurement(Exception): + pass + + +def value_of(input_str): + """ + Convert a string to a json date in the future + :param str input_str: (ex: 1d, 2d, 6years, 2 seconds) + :returns: Unit given in seconds + """ + + matches = PATTERN.search(input_str) + + if matches is None or len(matches.groups()) < 2: + raise InvalidString(f"'{input_str}' is invalid string") + + value, unit = matches.groups() + + if unit not in ALIASES: + raise UnknownTimeMeasurement( + "{} is not a valid time measure use one of {}".format( + unit, sorted(ALIASES.keys()) + ) + ) + + return ALIASES[unit](value) + + +def json_time_from_now(input_str, now=None, datetime_format=False): + """ + :param str input_str: Input string (see value of) + :param datetime now: Optionally set the definition of `now` + :param boolean datetime_format: Set `True` to get a `datetime` output + :returns: JSON string representation of time in future. 
+ """ + + if now is None: + now = datetime.datetime.utcnow() + + time = now + value_of(input_str) + + if datetime_format is True: + return time + else: + # Sorta a big hack but the json schema validator for date does not like the + # ISO dates until 'Z' (for timezone) is added... + # Microseconds are excluded (see bug 1381801) + return time.isoformat(timespec="milliseconds") + "Z" + + +def current_json_time(datetime_format=False): + """ + :param boolean datetime_format: Set `True` to get a `datetime` output + :returns: JSON string representation of the current time. + """ + if datetime_format is True: + return datetime.datetime.utcnow() + else: + # Microseconds are excluded (see bug 1381801) + return datetime.datetime.utcnow().isoformat(timespec="milliseconds") + "Z" diff --git a/third_party/python/taskcluster_taskgraph/taskgraph/util/treeherder.py b/third_party/python/taskcluster_taskgraph/taskgraph/util/treeherder.py new file mode 100644 index 0000000000..cff5f286cc --- /dev/null +++ b/third_party/python/taskcluster_taskgraph/taskgraph/util/treeherder.py @@ -0,0 +1,84 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. + +import re + +_JOINED_SYMBOL_RE = re.compile(r"([^(]*)\(([^)]*)\)$") + + +def split_symbol(treeherder_symbol): + """Split a symbol expressed as grp(sym) into its two parts. If no group is + given, the returned group is '?'""" + groupSymbol = "?" + symbol = treeherder_symbol + if "(" in symbol: + match = _JOINED_SYMBOL_RE.match(symbol) + if match: + groupSymbol, symbol = match.groups() + else: + raise Exception(f"`{symbol}` is not a valid treeherder symbol.") + return groupSymbol, symbol + + +def join_symbol(group, symbol): + """Perform the reverse of split_symbol, combining the given group and + symbol. 
If the group is '?', then it is omitted."""
+    if group == "?":
+        return symbol
+    return f"{group}({symbol})"
+
+
+def add_suffix(treeherder_symbol, suffix):
+    """Add a suffix to a treeherder symbol that may contain a group."""
+    group, symbol = split_symbol(treeherder_symbol)
+    symbol += str(suffix)
+    return join_symbol(group, symbol)
+
+
+def replace_group(treeherder_symbol, new_group):
+    """Replace the group of a treeherder symbol that may contain a group."""
+    _, symbol = split_symbol(treeherder_symbol)
+    return join_symbol(new_group, symbol)
+
+
+def inherit_treeherder_from_dep(job, dep_job):
+    """Inherit treeherder defaults from dep_job"""
+    treeherder = job.get("treeherder", {})
+
+    dep_th_platform = (
+        dep_job.task.get("extra", {})
+        .get("treeherder", {})
+        .get("machine", {})
+        .get("platform", "")
+    )
+    dep_th_collection = list(
+        dep_job.task.get("extra", {}).get("treeherder", {}).get("collection", {}).keys()
+    )[0]
+    treeherder.setdefault("platform", f"{dep_th_platform}/{dep_th_collection}")
+    treeherder.setdefault(
+        "tier", dep_job.task.get("extra", {}).get("treeherder", {}).get("tier", 1)
+    )
+    # Does not set symbol
+    treeherder.setdefault("kind", "build")
+    return treeherder
+
+
+def treeherder_defaults(kind, label):
+    defaults = {
+        # Despite its name, this is expected to be a platform+collection
+        "platform": "default/opt",
+        "tier": 1,
+    }
+    if "build" in kind:
+        defaults["kind"] = "build"
+    elif "test" in kind:
+        defaults["kind"] = "test"
+    else:
+        defaults["kind"] = "other"
+
+    # Takes the uppercased first letter of each part of the kind name, eg:
+    # apple-banana -> AB
+    defaults["symbol"] = "".join([c[0] for c in kind.split("-")]).upper()
+
+    return defaults
diff --git a/third_party/python/taskcluster_taskgraph/taskgraph/util/vcs.py b/third_party/python/taskcluster_taskgraph/taskgraph/util/vcs.py
new file mode 100644
index 0000000000..2d967d2645
--- /dev/null
+++ b/third_party/python/taskcluster_taskgraph/taskgraph/util/vcs.py
@@ -0,0 +1,552 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+
+import logging
+import os
+import re
+import subprocess
+from abc import ABC, abstractmethod, abstractproperty
+from shutil import which
+
+import requests
+from redo import retry
+
+from taskgraph.util.path import ancestors
+
+PUSHLOG_TMPL = "{}/json-pushes?version=2&changeset={}&tipsonly=1&full=1"
+
+logger = logging.getLogger(__name__)
+
+
+class Repository(ABC):
+    # Both mercurial and git use sha1 as revision identifiers. Luckily, both define
+    # the same value as the null revision.
+ # + # https://github.com/git/git/blob/dc04167d378fb29d30e1647ff6ff51dd182bc9a3/t/oid-info/hash-info#L7 + # https://www.mercurial-scm.org/repo/hg-stable/file/82efc31bd152/mercurial/node.py#l30 + NULL_REVISION = "0000000000000000000000000000000000000000" + + def __init__(self, path): + self.path = path + self.binary = which(self.tool) + if self.binary is None: + raise OSError(f"{self.tool} not found!") + self._valid_diff_filter = ("m", "a", "d") + + self._env = os.environ.copy() + + def run(self, *args: str, **kwargs): + return_codes = kwargs.pop("return_codes", []) + cmd = (self.binary,) + args + + try: + return subprocess.check_output( + cmd, cwd=self.path, env=self._env, encoding="utf-8", **kwargs + ) + except subprocess.CalledProcessError as e: + if e.returncode in return_codes: + return "" + raise + + @abstractproperty + def tool(self) -> str: + """Version control system being used, either 'hg' or 'git'.""" + + @abstractproperty + def head_rev(self) -> str: + """Hash of HEAD revision.""" + + @abstractproperty + def base_rev(self): + """Hash of revision the current topic branch is based on.""" + + @abstractproperty + def branch(self): + """Current branch or bookmark the checkout has active.""" + + @abstractproperty + def all_remote_names(self): + """Name of all configured remote repositories.""" + + @abstractproperty + def default_remote_name(self): + """Name the VCS defines for the remote repository when cloning + it for the first time. This name may not exist anymore if users + changed the default configuration, for instance.""" + + @abstractproperty + def remote_name(self): + """Name of the remote repository.""" + + def _get_most_suitable_remote(self, remote_instructions): + remotes = self.all_remote_names + if len(remotes) == 1: + return remotes[0] + + if self.default_remote_name in remotes: + return self.default_remote_name + + first_remote = remotes[0] + logger.warning( + f"Unable to determine which remote repository to use between: {remotes}. " + f'Arbitrarily using the first one "{first_remote}". Please set an ' + f"`{self.default_remote_name}` remote if the arbitrarily selected one " + f"is not right. To do so: {remote_instructions}" + ) + + return first_remote + + @abstractproperty + def default_branch(self): + """Name of the default branch.""" + + @abstractmethod + def get_url(self, remote=None): + """Get URL of the upstream repository.""" + + @abstractmethod + def get_commit_message(self, revision=None): + """Commit message of specified revision or current commit.""" + + @abstractmethod + def get_changed_files(self, diff_filter, mode="unstaged", rev=None, base_rev=None): + """Return a list of files that are changed in: + * either this repository's working copy, + * or at a given revision (``rev``) + * or between 2 revisions (``base_rev`` and ``rev``) + + ``diff_filter`` controls which kinds of modifications are returned. + It is a string which may only contain the following characters: + + A - Include files that were added + D - Include files that were deleted + M - Include files that were modified + + By default, all three will be included. + + ``mode`` can be one of 'unstaged', 'staged' or 'all'. Only has an + effect on git. Defaults to 'unstaged'. + + ``rev`` is a specifier for which changesets to consider for + changes. The exact meaning depends on the vcs system being used. + + ``base_rev`` specifies the range of changesets. This parameter cannot + be used without ``rev``. The range includes ``rev`` but excludes + ``base_rev``. 
+ """ + + @abstractmethod + def get_outgoing_files(self, diff_filter, upstream): + """Return a list of changed files compared to upstream. + + ``diff_filter`` works the same as `get_changed_files`. + ``upstream`` is a remote ref to compare against. If unspecified, + this will be determined automatically. If there is no remote ref, + a MissingUpstreamRepo exception will be raised. + """ + + @abstractmethod + def working_directory_clean(self, untracked=False, ignored=False): + """Determine if the working directory is free of modifications. + + Returns True if the working directory does not have any file + modifications. False otherwise. + + By default, untracked and ignored files are not considered. If + ``untracked`` or ``ignored`` are set, they influence the clean check + to factor these file classes into consideration. + """ + + @abstractmethod + def update(self, ref): + """Update the working directory to the specified reference.""" + + @abstractmethod + def find_latest_common_revision(self, base_ref_or_rev, head_rev): + """Find the latest revision that is common to both the given + ``head_rev`` and ``base_ref_or_rev``. + + If no common revision exists, ``Repository.NULL_REVISION`` will + be returned.""" + + @abstractmethod + def does_revision_exist_locally(self, revision): + """Check whether this revision exists in the local repository. + + If this function returns an unexpected value, then make sure + the revision was fetched from the remote repository.""" + + +class HgRepository(Repository): + tool = "hg" + default_remote_name = "default" + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self._env["HGPLAIN"] = "1" + + @property + def head_rev(self): + return self.run("log", "-r", ".", "-T", "{node}").strip() + + @property + def base_rev(self): + return self.run("log", "-r", "last(ancestors(.) and public())", "-T", "{node}") + + @property + def branch(self): + bookmarks_fn = os.path.join(self.path, ".hg", "bookmarks.current") + if os.path.exists(bookmarks_fn): + with open(bookmarks_fn) as f: + bookmark = f.read() + return bookmark or None + + return None + + @property + def all_remote_names(self): + remotes = self.run("paths", "--quiet").splitlines() + if not remotes: + raise RuntimeError("No remotes defined") + return remotes + + @property + def remote_name(self): + return self._get_most_suitable_remote( + "Edit .hg/hgrc and add:\n\n[paths]\ndefault = $URL", + ) + + @property + def default_branch(self): + # Mercurial recommends keeping "default" + # https://www.mercurial-scm.org/wiki/StandardBranching#Don.27t_use_a_name_other_than_default_for_your_main_development_branch + return "default" + + def get_url(self, remote="default"): + return self.run("path", "-T", "{url}", remote).strip() + + def get_commit_message(self, revision=None): + revision = revision or "." + return self.run("log", "-r", revision, "-T", "{desc}") + + def _format_diff_filter(self, diff_filter, for_status=False): + df = diff_filter.lower() + assert all(f in self._valid_diff_filter for f in df) + + # When looking at the changes in the working directory, the hg status + # command uses 'd' for files that have been deleted with a non-hg + # command, and 'r' for files that have been `hg rm`ed. Use both. 
+ return df.replace("d", "dr") if for_status else df + + def _files_template(self, diff_filter): + template = "" + df = self._format_diff_filter(diff_filter) + if "a" in df: + template += "{file_adds % '{file}\\n'}" + if "d" in df: + template += "{file_dels % '{file}\\n'}" + if "m" in df: + template += "{file_mods % '{file}\\n'}" + return template + + def get_changed_files( + self, diff_filter="ADM", mode="unstaged", rev=None, base_rev=None + ): + if rev is None: + if base_rev is not None: + raise ValueError("Cannot specify `base_rev` without `rev`") + # Use --no-status to print just the filename. + df = self._format_diff_filter(diff_filter, for_status=True) + return self.run("status", "--no-status", f"-{df}").splitlines() + else: + template = self._files_template(diff_filter) + revision_argument = rev if base_rev is None else f"{base_rev}~-1::{rev}" + return self.run("log", "-r", revision_argument, "-T", template).splitlines() + + def get_outgoing_files(self, diff_filter="ADM", upstream=None): + template = self._files_template(diff_filter) + + if not upstream: + return self.run( + "log", "-r", "draft() and ancestors(.)", "--template", template + ).split() + + return self.run( + "outgoing", + "-r", + ".", + "--quiet", + "--template", + template, + upstream, + return_codes=(1,), + ).split() + + def working_directory_clean(self, untracked=False, ignored=False): + args = ["status", "--modified", "--added", "--removed", "--deleted"] + if untracked: + args.append("--unknown") + if ignored: + args.append("--ignored") + + # If output is empty, there are no entries of requested status, which + # means we are clean. + return not len(self.run(*args).strip()) + + def update(self, ref): + return self.run("update", "--check", ref) + + def find_latest_common_revision(self, base_ref_or_rev, head_rev): + ancestor = self.run( + "log", + "-r", + f"last(ancestors('{base_ref_or_rev}') and ancestors('{head_rev}'))", + "--template", + "{node}", + ).strip() + return ancestor or self.NULL_REVISION + + def does_revision_exist_locally(self, revision): + try: + return bool(self.run("log", "-r", revision).strip()) + except subprocess.CalledProcessError as e: + # Error code 255 comes with the message: + # "abort: unknown revision $REVISION" + if e.returncode == 255: + return False + raise + + +class GitRepository(Repository): + tool = "git" + default_remote_name = "origin" + + _LS_REMOTE_PATTERN = re.compile(r"ref:\s+refs/heads/(?P<branch_name>\S+)\s+HEAD") + + @property + def head_rev(self): + return self.run("rev-parse", "--verify", "HEAD").strip() + + @property + def base_rev(self): + refs = self.run( + "rev-list", "HEAD", "--topo-order", "--boundary", "--not", "--remotes" + ).splitlines() + if refs: + return refs[-1][1:] # boundary starts with a prefix `-` + return self.head_rev + + @property + def branch(self): + return self.run("branch", "--show-current").strip() or None + + @property + def all_remote_names(self): + remotes = self.run("remote").splitlines() + if not remotes: + raise RuntimeError("No remotes defined") + return remotes + + @property + def remote_name(self): + try: + remote_branch_name = self.run( + "rev-parse", + "--verify", + "--abbrev-ref", + "--symbolic-full-name", + "@{u}", + stderr=subprocess.PIPE, + ).strip() + return remote_branch_name.split("/")[0] + except subprocess.CalledProcessError as e: + # Error code 128 comes with the message: + # "fatal: no upstream configured for branch $BRANCH" + if e.returncode != 128: + print(e.stderr) + raise + + return self._get_most_suitable_remote("`git 
remote add origin $URL`") + + @property + def default_branch(self): + try: + # this one works if the current repo was cloned from an existing + # repo elsewhere + return self._get_default_branch_from_cloned_metadata() + except (subprocess.CalledProcessError, RuntimeError): + pass + + try: + # This call works if you have (network) access to the repo + return self._get_default_branch_from_remote_query() + except (subprocess.CalledProcessError, RuntimeError): + pass + + # this one is the last resort in case the remote is not accessible and + # the local repo is where `git init` was made + return self._guess_default_branch() + + def _get_default_branch_from_remote_query(self): + # This function requires network access to the repo + remote_name = self.remote_name + output = self.run("ls-remote", "--symref", remote_name, "HEAD") + matches = self._LS_REMOTE_PATTERN.search(output) + if not matches: + raise RuntimeError( + f'Could not find the default branch of remote repository "{remote_name}". ' + "Got: {output}" + ) + + branch_name = matches.group("branch_name") + return f"{remote_name}/{branch_name}" + + def _get_default_branch_from_cloned_metadata(self): + return self.run("rev-parse", "--abbrev-ref", f"{self.remote_name}/HEAD").strip() + + def _guess_default_branch(self): + branches = [ + line.strip() + for line in self.run( + "branch", "--all", "--no-color", "--format=%(refname)" + ).splitlines() + for candidate_branch in ("main", "master", "branches/default/tip") + if line.strip().endswith(candidate_branch) + ] + + if len(branches) == 1: + return branches[0] + + raise RuntimeError(f"Unable to find default branch. Got: {branches}") + + def get_url(self, remote="origin"): + return self.run("remote", "get-url", remote).strip() + + def get_commit_message(self, revision=None): + revision = revision or "HEAD" + return self.run("log", "-n1", "--format=%B", revision) + + def get_changed_files( + self, diff_filter="ADM", mode="unstaged", rev=None, base_rev=None + ): + assert all(f.lower() in self._valid_diff_filter for f in diff_filter) + + if rev is None: + if base_rev is not None: + raise ValueError("Cannot specify `base_rev` without `rev`") + cmd = ["diff"] + if mode == "staged": + cmd.append("--cached") + elif mode == "all": + cmd.append("HEAD") + else: + revision_argument = ( + f"{rev}~1..{rev}" if base_rev is None else f"{base_rev}..{rev}" + ) + cmd = ["log", "--format=format:", revision_argument] + + cmd.append("--name-only") + cmd.append("--diff-filter=" + diff_filter.upper()) + + files = self.run(*cmd).splitlines() + return [f for f in files if f] + + def get_outgoing_files(self, diff_filter="ADM", upstream=None): + assert all(f.lower() in self._valid_diff_filter for f in diff_filter) + + not_condition = upstream if upstream else "--remotes" + + files = self.run( + "log", + "--name-only", + f"--diff-filter={diff_filter.upper()}", + "--oneline", + "--pretty=format:", + "HEAD", + "--not", + not_condition, + ).splitlines() + return [f for f in files if f] + + def working_directory_clean(self, untracked=False, ignored=False): + args = ["status", "--porcelain"] + + # Even in --porcelain mode, behavior is affected by the + # ``status.showUntrackedFiles`` option, which means we need to be + # explicit about how to treat untracked files. + if untracked: + args.append("--untracked-files=all") + else: + args.append("--untracked-files=no") + + if ignored: + args.append("--ignored") + + # If output is empty, there are no entries of requested status, which + # means we are clean. 
+ return not len(self.run(*args).strip()) + + def update(self, ref): + self.run("checkout", ref) + + def find_latest_common_revision(self, base_ref_or_rev, head_rev): + try: + return self.run("merge-base", base_ref_or_rev, head_rev).strip() + except subprocess.CalledProcessError: + return self.NULL_REVISION + + def does_revision_exist_locally(self, revision): + try: + return self.run("cat-file", "-t", revision).strip() == "commit" + except subprocess.CalledProcessError as e: + # Error code 128 comes with the message: + # "git cat-file: could not get object info" + if e.returncode == 128: + return False + raise + + +def get_repository(path): + """Get a repository object for the repository at `path`. + If `path` is not a known VCS repository, raise an exception. + """ + for path in ancestors(path): + if os.path.isdir(os.path.join(path, ".hg")): + return HgRepository(path) + elif os.path.exists(os.path.join(path, ".git")): + return GitRepository(path) + + raise RuntimeError("Current directory is neither a git or hg repository") + + +def find_hg_revision_push_info(repository, revision): + """Given the parameters for this action and a revision, find the + pushlog_id of the revision.""" + pushlog_url = PUSHLOG_TMPL.format(repository, revision) + + def query_pushlog(url): + r = requests.get(pushlog_url, timeout=60) + r.raise_for_status() + return r + + r = retry( + query_pushlog, + args=(pushlog_url,), + attempts=5, + sleeptime=10, + ) + pushes = r.json()["pushes"] + if len(pushes) != 1: + raise RuntimeError( + "Unable to find a single pushlog_id for {} revision {}: {}".format( + repository, revision, pushes + ) + ) + pushid = list(pushes.keys())[0] + return { + "pushdate": pushes[pushid]["date"], + "pushid": pushid, + "user": pushes[pushid]["user"], + } diff --git a/third_party/python/taskcluster_taskgraph/taskgraph/util/verify.py b/third_party/python/taskcluster_taskgraph/taskgraph/util/verify.py new file mode 100644 index 0000000000..e6705c16cf --- /dev/null +++ b/third_party/python/taskcluster_taskgraph/taskgraph/util/verify.py @@ -0,0 +1,283 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. 
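A small, hedged sketch of how the repository helpers from taskgraph/util/vcs.py above are typically used. It assumes the working directory is inside a git or Mercurial checkout; only methods defined in that file are called.

    import os

    from taskgraph.util.vcs import get_repository

    # get_repository() walks up from the given path looking for a .hg or
    # .git directory and returns the matching Repository wrapper.
    repo = get_repository(os.getcwd())

    print(repo.tool)      # "hg" or "git"
    print(repo.head_rev)  # hash of the current revision
    print(repo.branch)    # active branch/bookmark, or None

    # Modified files; for git, mode="all" also includes staged changes.
    print(repo.get_changed_files(diff_filter="M", mode="all"))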
+ + +import logging +import sys +from abc import ABC, abstractmethod +from dataclasses import dataclass, field +from typing import Callable, Dict, List, Union + +from taskgraph.config import GraphConfig +from taskgraph.parameters import Parameters +from taskgraph.taskgraph import TaskGraph +from taskgraph.util.attributes import match_run_on_projects +from taskgraph.util.treeherder import join_symbol + +logger = logging.getLogger(__name__) + + +@dataclass(frozen=True) +class Verification(ABC): + func: Callable + + @abstractmethod + def verify(self, **kwargs) -> None: + pass + + +@dataclass(frozen=True) +class InitialVerification(Verification): + """Verification that doesn't depend on any generation state.""" + + def verify(self): + self.func() + + +@dataclass(frozen=True) +class GraphVerification(Verification): + """Verification for a TaskGraph object.""" + + run_on_projects: Union[List, None] = field(default=None) + + def verify( + self, graph: TaskGraph, graph_config: GraphConfig, parameters: Parameters + ): + if self.run_on_projects and not match_run_on_projects( + parameters["project"], self.run_on_projects + ): + return + + scratch_pad = {} + graph.for_each_task( + self.func, + scratch_pad=scratch_pad, + graph_config=graph_config, + parameters=parameters, + ) + self.func( + None, + graph, + scratch_pad=scratch_pad, + graph_config=graph_config, + parameters=parameters, + ) + + +@dataclass(frozen=True) +class ParametersVerification(Verification): + """Verification for a set of parameters.""" + + def verify(self, parameters: Parameters): + self.func(parameters) + + +@dataclass(frozen=True) +class KindsVerification(Verification): + """Verification for kinds.""" + + def verify(self, kinds: dict): + self.func(kinds) + + +@dataclass(frozen=True) +class VerificationSequence: + """ + Container for a sequence of verifications over a TaskGraph. Each + verification is represented as a callable taking (task, taskgraph, + scratch_pad), called for each task in the taskgraph, and one more + time with no task but with the taskgraph and the same scratch_pad + that was passed for each task. + """ + + _verifications: Dict = field(default_factory=dict) + _verification_types = { + "graph": GraphVerification, + "initial": InitialVerification, + "kinds": KindsVerification, + "parameters": ParametersVerification, + } + + def __call__(self, name, *args, **kwargs): + for verification in self._verifications.get(name, []): + verification.verify(*args, **kwargs) + + def add(self, name, **kwargs): + cls = self._verification_types.get(name, GraphVerification) + + def wrap(func): + self._verifications.setdefault(name, []).append(cls(func, **kwargs)) + return func + + return wrap + + +verifications = VerificationSequence() + + +@verifications.add("full_task_graph") +def verify_task_graph_symbol(task, taskgraph, scratch_pad, graph_config, parameters): + """ + This function verifies that tuple + (collection.keys(), machine.platform, groupSymbol, symbol) is unique + for a target task graph. 
+    """
+    if task is None:
+        return
+    task_dict = task.task
+    if "extra" in task_dict:
+        extra = task_dict["extra"]
+        if "treeherder" in extra:
+            treeherder = extra["treeherder"]
+
+            collection_keys = tuple(sorted(treeherder.get("collection", {}).keys()))
+            if len(collection_keys) != 1:
+                raise Exception(
+                    "Task {} can't be in multiple treeherder collections "
+                    "(the part of the platform after `/`): {}".format(
+                        task.label, collection_keys
+                    )
+                )
+            platform = treeherder.get("machine", {}).get("platform")
+            group_symbol = treeherder.get("groupSymbol")
+            symbol = treeherder.get("symbol")
+
+            key = (platform, collection_keys[0], group_symbol, symbol)
+            if key in scratch_pad:
+                raise Exception(
+                    "Duplicate treeherder platform and symbol in tasks "
+                    "`{}` and `{}`: {} {}".format(
+                        task.label,
+                        scratch_pad[key],
+                        f"{platform}/{collection_keys[0]}",
+                        join_symbol(group_symbol, symbol),
+                    )
+                )
+            else:
+                scratch_pad[key] = task.label
+
+
+@verifications.add("full_task_graph")
+def verify_trust_domain_v2_routes(
+    task, taskgraph, scratch_pad, graph_config, parameters
+):
+    """
+    This function ensures that any two tasks have distinct ``index.{trust-domain}.v2`` routes.
+    """
+    if task is None:
+        return
+    route_prefix = "index.{}.v2".format(graph_config["trust-domain"])
+    task_dict = task.task
+    routes = task_dict.get("routes", [])
+
+    for route in routes:
+        if route.startswith(route_prefix):
+            if route in scratch_pad:
+                raise Exception(
+                    "conflict between {}:{} for route: {}".format(
+                        task.label, scratch_pad[route], route
+                    )
+                )
+            else:
+                scratch_pad[route] = task.label
+
+
+@verifications.add("full_task_graph")
+def verify_routes_notification_filters(
+    task, taskgraph, scratch_pad, graph_config, parameters
+):
+    """
+    This function ensures that only understood filters for notifications are
+    specified.
+
+    See: https://docs.taskcluster.net/reference/core/taskcluster-notify/docs/usage
+    """
+    if task is None:
+        return
+    route_prefix = "notify."
+    valid_filters = ("on-any", "on-completed", "on-failed", "on-exception")
+    task_dict = task.task
+    routes = task_dict.get("routes", [])
+
+    for route in routes:
+        if route.startswith(route_prefix):
+            # Get the filter of the route
+            route_filter = route.split(".")[-1]
+            if route_filter not in valid_filters:
+                raise Exception(
+                    "{} has invalid notification filter ({})".format(
+                        task.label, route_filter
+                    )
+                )
+
+
+@verifications.add("full_task_graph")
+def verify_dependency_tiers(task, taskgraph, scratch_pad, graph_config, parameters):
+    tiers = scratch_pad
+    if task is not None:
+        tiers[task.label] = (
+            task.task.get("extra", {}).get("treeherder", {}).get("tier", sys.maxsize)
+        )
+    else:
+
+        def printable_tier(tier):
+            if tier == sys.maxsize:
+                return "unknown"
+            return tier
+
+        for task in taskgraph.tasks.values():
+            tier = tiers[task.label]
+            for d in task.dependencies.values():
+                if taskgraph[d].task.get("workerType") == "always-optimized":
+                    continue
+                if "dummy" in taskgraph[d].kind:
+                    continue
+                if tier < tiers[d]:
+                    raise Exception(
+                        "{} (tier {}) cannot depend on {} (tier {})".format(
+                            task.label,
+                            printable_tier(tier),
+                            d,
+                            printable_tier(tiers[d]),
+                        )
+                    )
+
+
+@verifications.add("full_task_graph")
+def verify_toolchain_alias(task, taskgraph, scratch_pad, graph_config, parameters):
+    """
+    This function verifies that toolchain aliases are not reused.
+    """
+    if task is None:
+        return
+    attributes = task.attributes
+    if "toolchain-alias" in attributes:
+        keys = attributes["toolchain-alias"]
+        if not keys:
+            keys = []
+        elif isinstance(keys, str):
+            keys = [keys]
+        for key in keys:
+            if key in scratch_pad:
+                raise Exception(
+                    "Duplicate toolchain-alias in tasks "
+                    "`{}` and `{}`: {}".format(
+                        task.label,
+                        scratch_pad[key],
+                        key,
+                    )
+                )
+            else:
+                scratch_pad[key] = task.label
+
+
+@verifications.add("optimized_task_graph")
+def verify_always_optimized(task, taskgraph, scratch_pad, graph_config, parameters):
+    """
+    This function ensures that always-optimized tasks have been optimized.
+    """
+    if task is None:
+        return
+    if task.task.get("workerType") == "always-optimized":
+        raise Exception(f"Could not optimize the task {task.label!r}")
diff --git a/third_party/python/taskcluster_taskgraph/taskgraph/util/workertypes.py b/third_party/python/taskcluster_taskgraph/taskgraph/util/workertypes.py
new file mode 100644
index 0000000000..da39654d6b
--- /dev/null
+++ b/third_party/python/taskcluster_taskgraph/taskgraph/util/workertypes.py
@@ -0,0 +1,78 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+from dataclasses import dataclass
+
+from .keyed_by import evaluate_keyed_by
+from .memoize import memoize
+
+
+@dataclass
+class _BuiltinWorkerType:
+    provisioner: str
+    worker_type: str
+
+    @property
+    def implementation(self):
+        """
+        Since the list of built-in worker-types is small and fixed, we can get
+        away with punning the implementation name (in
+        `taskgraph.transforms.task`) and the worker_type.
+        """
+        return self.worker_type
+
+
+_BUILTIN_TYPES = {
+    "always-optimized": _BuiltinWorkerType("invalid", "always-optimized"),
+    "succeed": _BuiltinWorkerType("built-in", "succeed"),
+}
+
+
+@memoize
+def worker_type_implementation(graph_config, worker_type):
+    """Get the worker implementation and OS for the given workerType, where the
+    OS represents the host system, not the target OS, in the case of
+    cross-compiles."""
+    if worker_type in _BUILTIN_TYPES:
+        # For the built-in worker-types, we use an `implementation` that matches
+        # the worker-type.
+        return _BUILTIN_TYPES[worker_type].implementation, None
+    worker_config = evaluate_keyed_by(
+        {"by-worker-type": graph_config["workers"]["aliases"]},
+        "worker-types.yml",
+        {"worker-type": worker_type},
+    )
+    return worker_config["implementation"], worker_config.get("os")
+
+
+@memoize
+def get_worker_type(graph_config, alias, level):
+    """
+    Get the worker type, evaluating aliases from the graph config.
+    """
+    if alias in _BUILTIN_TYPES:
+        builtin_type = _BUILTIN_TYPES[alias]
+        return builtin_type.provisioner, builtin_type.worker_type
+
+    level = str(level)
+    worker_config = evaluate_keyed_by(
+        {"by-alias": graph_config["workers"]["aliases"]},
+        "graph_config.workers.aliases",
+        {"alias": alias},
+    )
+    provisioner = evaluate_keyed_by(
+        worker_config["provisioner"],
+        alias,
+        {"level": level},
+    ).format(
+        **{"alias": alias, "level": level, "trust-domain": graph_config["trust-domain"]}
+    )
+    worker_type = evaluate_keyed_by(
+        worker_config["worker-type"],
+        alias,
+        {"level": level},
+    ).format(
+        **{"alias": alias, "level": level, "trust-domain": graph_config["trust-domain"]}
+    )
+    return provisioner, worker_type
diff --git a/third_party/python/taskcluster_taskgraph/taskgraph/util/yaml.py b/third_party/python/taskcluster_taskgraph/taskgraph/util/yaml.py
new file mode 100644
index 0000000000..141c7a16d3
--- /dev/null
+++ b/third_party/python/taskcluster_taskgraph/taskgraph/util/yaml.py
@@ -0,0 +1,36 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+
+import os
+
+from yaml.loader import SafeLoader
+
+
+class UnicodeLoader(SafeLoader):
+    def construct_yaml_str(self, node):
+        return self.construct_scalar(node)
+
+
+UnicodeLoader.add_constructor("tag:yaml.org,2002:str", UnicodeLoader.construct_yaml_str)
+
+
+def load_stream(stream):
+    """
+    Parse the first YAML document in a stream
+    and produce the corresponding Python object.
+    """
+    loader = UnicodeLoader(stream)
+    try:
+        return loader.get_single_data()
+    finally:
+        loader.dispose()
+
+
+def load_yaml(*parts):
+    """Convenience function to load a YAML file in the given path. This is
+    useful for loading kind configuration files from the kind path."""
+    filename = os.path.join(*parts)
+    with open(filename, "rb") as f:
+        return load_stream(f)
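To round out these utilities, a short usage sketch of the YAML helper above. This is illustrative only: the kind.yml path is hypothetical and not taken from the diff.

from taskgraph.util.yaml import load_yaml

# load_yaml() joins its arguments with os.path.join and parses the file with
# the SafeLoader-based UnicodeLoader defined above.
kind_config = load_yaml("taskcluster", "kinds", "build", "kind.yml")  # hypothetical path
print(kind_config.get("kind-dependencies", []))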