diff options
Diffstat (limited to 'third_party/python/taskcluster_taskgraph/taskgraph/util/vcs.py')
-rw-r--r-- | third_party/python/taskcluster_taskgraph/taskgraph/util/vcs.py | 539 |
1 files changed, 539 insertions, 0 deletions
diff --git a/third_party/python/taskcluster_taskgraph/taskgraph/util/vcs.py b/third_party/python/taskcluster_taskgraph/taskgraph/util/vcs.py new file mode 100644 index 0000000000..ba1d909019 --- /dev/null +++ b/third_party/python/taskcluster_taskgraph/taskgraph/util/vcs.py @@ -0,0 +1,539 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. + + +import logging +import os +import re +import subprocess +from abc import ABC, abstractmethod, abstractproperty +from shutil import which + +import requests +from redo import retry + +from taskgraph.util.path import ancestors + +PUSHLOG_TMPL = "{}/json-pushes?version=2&changeset={}&tipsonly=1&full=1" + +logger = logging.getLogger(__name__) + + +class Repository(ABC): + # Both mercurial and git use sha1 as revision idenfiers. Luckily, both define + # the same value as the null revision. + # + # https://github.com/git/git/blob/dc04167d378fb29d30e1647ff6ff51dd182bc9a3/t/oid-info/hash-info#L7 + # https://www.mercurial-scm.org/repo/hg-stable/file/82efc31bd152/mercurial/node.py#l30 + NULL_REVISION = "0000000000000000000000000000000000000000" + + def __init__(self, path): + self.path = path + self.binary = which(self.tool) + if self.binary is None: + raise OSError(f"{self.tool} not found!") + self._valid_diff_filter = ("m", "a", "d") + + self._env = os.environ.copy() + + def run(self, *args: str, **kwargs): + return_codes = kwargs.pop("return_codes", []) + cmd = (self.binary,) + args + + try: + return subprocess.check_output( + cmd, cwd=self.path, env=self._env, encoding="utf-8", **kwargs + ) + except subprocess.CalledProcessError as e: + if e.returncode in return_codes: + return "" + raise + + @abstractproperty + def tool(self) -> str: + """Version control system being used, either 'hg' or 'git'.""" + + @abstractproperty + def head_rev(self) -> str: + """Hash of HEAD revision.""" + + @abstractproperty + def base_rev(self): + """Hash of revision the current topic branch is based on.""" + + @abstractproperty + def branch(self): + """Current branch or bookmark the checkout has active.""" + + @abstractproperty + def all_remote_names(self): + """Name of all configured remote repositories.""" + + @abstractproperty + def default_remote_name(self): + """Name the VCS defines for the remote repository when cloning + it for the first time. This name may not exist anymore if users + changed the default configuration, for instance.""" + + @abstractproperty + def remote_name(self): + """Name of the remote repository.""" + + def _get_most_suitable_remote(self, remote_instructions): + remotes = self.all_remote_names + if len(remotes) == 1: + return remotes[0] + + if self.default_remote_name in remotes: + return self.default_remote_name + + first_remote = remotes[0] + logger.warning( + f"Unable to determine which remote repository to use between: {remotes}. " + f'Arbitrarily using the first one "{first_remote}". Please set an ' + f"`{self.default_remote_name}` remote if the arbitrarily selected one " + f"is not right. To do so: {remote_instructions}" + ) + + return first_remote + + @abstractproperty + def default_branch(self): + """Name of the default branch.""" + + @abstractmethod + def get_url(self, remote=None): + """Get URL of the upstream repository.""" + + @abstractmethod + def get_commit_message(self, revision=None): + """Commit message of specified revision or current commit.""" + + @abstractmethod + def get_changed_files(self, diff_filter, mode="unstaged", rev=None, base_rev=None): + """Return a list of files that are changed in: + * either this repository's working copy, + * or at a given revision (``rev``) + * or between 2 revisions (``base_rev`` and ``rev``) + + ``diff_filter`` controls which kinds of modifications are returned. + It is a string which may only contain the following characters: + + A - Include files that were added + D - Include files that were deleted + M - Include files that were modified + + By default, all three will be included. + + ``mode`` can be one of 'unstaged', 'staged' or 'all'. Only has an + effect on git. Defaults to 'unstaged'. + + ``rev`` is a specifier for which changesets to consider for + changes. The exact meaning depends on the vcs system being used. + + ``base_rev`` specifies the range of changesets. This parameter cannot + be used without ``rev``. The range includes ``rev`` but excludes + ``base_rev``. + """ + + @abstractmethod + def get_outgoing_files(self, diff_filter, upstream): + """Return a list of changed files compared to upstream. + + ``diff_filter`` works the same as `get_changed_files`. + ``upstream`` is a remote ref to compare against. If unspecified, + this will be determined automatically. If there is no remote ref, + a MissingUpstreamRepo exception will be raised. + """ + + @abstractmethod + def working_directory_clean(self, untracked=False, ignored=False): + """Determine if the working directory is free of modifications. + + Returns True if the working directory does not have any file + modifications. False otherwise. + + By default, untracked and ignored files are not considered. If + ``untracked`` or ``ignored`` are set, they influence the clean check + to factor these file classes into consideration. + """ + + @abstractmethod + def update(self, ref): + """Update the working directory to the specified reference.""" + + @abstractmethod + def find_latest_common_revision(self, base_ref_or_rev, head_rev): + """Find the latest revision that is common to both the given + ``head_rev`` and ``base_ref_or_rev``""" + + @abstractmethod + def does_revision_exist_locally(self, revision): + """Check whether this revision exists in the local repository. + + If this function returns an unexpected value, then make sure + the revision was fetched from the remote repository.""" + + +class HgRepository(Repository): + tool = "hg" + default_remote_name = "default" + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self._env["HGPLAIN"] = "1" + + @property + def head_rev(self): + return self.run("log", "-r", ".", "-T", "{node}").strip() + + @property + def base_rev(self): + return self.run("log", "-r", "last(ancestors(.) and public())", "-T", "{node}") + + @property + def branch(self): + bookmarks_fn = os.path.join(self.path, ".hg", "bookmarks.current") + if os.path.exists(bookmarks_fn): + with open(bookmarks_fn) as f: + bookmark = f.read() + return bookmark or None + + return None + + @property + def all_remote_names(self): + remotes = self.run("paths", "--quiet").splitlines() + if not remotes: + raise RuntimeError("No remotes defined") + return remotes + + @property + def remote_name(self): + return self._get_most_suitable_remote( + "Edit .hg/hgrc and add:\n\n[paths]\ndefault = $URL", + ) + + @property + def default_branch(self): + # Mercurial recommends keeping "default" + # https://www.mercurial-scm.org/wiki/StandardBranching#Don.27t_use_a_name_other_than_default_for_your_main_development_branch + return "default" + + def get_url(self, remote="default"): + return self.run("path", "-T", "{url}", remote).strip() + + def get_commit_message(self, revision=None): + revision = revision or self.head_rev + return self.run("log", "-r", ".", "-T", "{desc}") + + def _format_diff_filter(self, diff_filter, for_status=False): + df = diff_filter.lower() + assert all(f in self._valid_diff_filter for f in df) + + # When looking at the changes in the working directory, the hg status + # command uses 'd' for files that have been deleted with a non-hg + # command, and 'r' for files that have been `hg rm`ed. Use both. + return df.replace("d", "dr") if for_status else df + + def _files_template(self, diff_filter): + template = "" + df = self._format_diff_filter(diff_filter) + if "a" in df: + template += "{file_adds % '{file}\\n'}" + if "d" in df: + template += "{file_dels % '{file}\\n'}" + if "m" in df: + template += "{file_mods % '{file}\\n'}" + return template + + def get_changed_files( + self, diff_filter="ADM", mode="unstaged", rev=None, base_rev=None + ): + if rev is None: + if base_rev is not None: + raise ValueError("Cannot specify `base_rev` without `rev`") + # Use --no-status to print just the filename. + df = self._format_diff_filter(diff_filter, for_status=True) + return self.run("status", "--no-status", f"-{df}").splitlines() + else: + template = self._files_template(diff_filter) + revision_argument = rev if base_rev is None else f"{base_rev}~-1::{rev}" + return self.run("log", "-r", revision_argument, "-T", template).splitlines() + + def get_outgoing_files(self, diff_filter="ADM", upstream=None): + template = self._files_template(diff_filter) + + if not upstream: + return self.run( + "log", "-r", "draft() and ancestors(.)", "--template", template + ).split() + + return self.run( + "outgoing", + "-r", + ".", + "--quiet", + "--template", + template, + upstream, + return_codes=(1,), + ).split() + + def working_directory_clean(self, untracked=False, ignored=False): + args = ["status", "--modified", "--added", "--removed", "--deleted"] + if untracked: + args.append("--unknown") + if ignored: + args.append("--ignored") + + # If output is empty, there are no entries of requested status, which + # means we are clean. + return not len(self.run(*args).strip()) + + def update(self, ref): + return self.run("update", "--check", ref) + + def find_latest_common_revision(self, base_ref_or_rev, head_rev): + return self.run( + "log", + "-r", + f"last(ancestors('{base_ref_or_rev}') and ancestors('{head_rev}'))", + "--template", + "{node}", + ).strip() + + def does_revision_exist_locally(self, revision): + try: + return self.run("log", "-r", revision).strip() != "" + except subprocess.CalledProcessError as e: + # Error code 255 comes with the message: + # "abort: unknown revision $REVISION" + if e.returncode == 255: + return False + raise + + +class GitRepository(Repository): + tool = "git" + default_remote_name = "origin" + + _LS_REMOTE_PATTERN = re.compile(r"ref:\s+refs/heads/(?P<branch_name>\S+)\s+HEAD") + + @property + def head_rev(self): + return self.run("rev-parse", "--verify", "HEAD").strip() + + @property + def base_rev(self): + refs = self.run( + "rev-list", "HEAD", "--topo-order", "--boundary", "--not", "--remotes" + ).splitlines() + if refs: + return refs[-1][1:] # boundary starts with a prefix `-` + return self.head_rev + + @property + def branch(self): + return self.run("branch", "--show-current").strip() or None + + @property + def all_remote_names(self): + remotes = self.run("remote").splitlines() + if not remotes: + raise RuntimeError("No remotes defined") + return remotes + + @property + def remote_name(self): + try: + remote_branch_name = self.run( + "rev-parse", "--verify", "--abbrev-ref", "--symbolic-full-name", "@{u}" + ).strip() + return remote_branch_name.split("/")[0] + except subprocess.CalledProcessError as e: + # Error code 128 comes with the message: + # "fatal: no upstream configured for branch $BRANCH" + if e.returncode != 128: + raise + + return self._get_most_suitable_remote("`git remote add origin $URL`") + + @property + def default_branch(self): + try: + # this one works if the current repo was cloned from an existing + # repo elsewhere + return self._get_default_branch_from_cloned_metadata() + except (subprocess.CalledProcessError, RuntimeError): + pass + + try: + # This call works if you have (network) access to the repo + return self._get_default_branch_from_remote_query() + except (subprocess.CalledProcessError, RuntimeError): + pass + + # this one is the last resort in case the remote is not accessible and + # the local repo is where `git init` was made + return self._guess_default_branch() + + def _get_default_branch_from_remote_query(self): + # This function requires network access to the repo + remote_name = self.remote_name + output = self.run("ls-remote", "--symref", remote_name, "HEAD") + matches = self._LS_REMOTE_PATTERN.search(output) + if not matches: + raise RuntimeError( + f'Could not find the default branch of remote repository "{remote_name}". ' + "Got: {output}" + ) + + branch_name = matches.group("branch_name") + return f"{remote_name}/{branch_name}" + + def _get_default_branch_from_cloned_metadata(self): + return self.run("rev-parse", "--abbrev-ref", f"{self.remote_name}/HEAD").strip() + + def _guess_default_branch(self): + branches = [ + line.strip() + for line in self.run( + "branch", "--all", "--no-color", "--format=%(refname)" + ).splitlines() + for candidate_branch in ("main", "master", "branches/default/tip") + if line.strip().endswith(candidate_branch) + ] + + if len(branches) == 1: + return branches[0] + + raise RuntimeError(f"Unable to find default branch. Got: {branches}") + + def get_url(self, remote="origin"): + return self.run("remote", "get-url", remote).strip() + + def get_commit_message(self, revision=None): + revision = revision or self.head_rev + return self.run("log", "-n1", "--format=%B") + + def get_changed_files( + self, diff_filter="ADM", mode="unstaged", rev=None, base_rev=None + ): + assert all(f.lower() in self._valid_diff_filter for f in diff_filter) + + if rev is None: + if base_rev is not None: + raise ValueError("Cannot specify `base_rev` without `rev`") + cmd = ["diff"] + if mode == "staged": + cmd.append("--cached") + elif mode == "all": + cmd.append("HEAD") + else: + revision_argument = ( + f"{rev}~1..{rev}" if base_rev is None else f"{base_rev}..{rev}" + ) + cmd = ["log", "--format=format:", revision_argument] + + cmd.append("--name-only") + cmd.append("--diff-filter=" + diff_filter.upper()) + + files = self.run(*cmd).splitlines() + return [f for f in files if f] + + def get_outgoing_files(self, diff_filter="ADM", upstream=None): + assert all(f.lower() in self._valid_diff_filter for f in diff_filter) + + not_condition = upstream if upstream else "--remotes" + + files = self.run( + "log", + "--name-only", + f"--diff-filter={diff_filter.upper()}", + "--oneline", + "--pretty=format:", + "HEAD", + "--not", + not_condition, + ).splitlines() + return [f for f in files if f] + + def working_directory_clean(self, untracked=False, ignored=False): + args = ["status", "--porcelain"] + + # Even in --porcelain mode, behavior is affected by the + # ``status.showUntrackedFiles`` option, which means we need to be + # explicit about how to treat untracked files. + if untracked: + args.append("--untracked-files=all") + else: + args.append("--untracked-files=no") + + if ignored: + args.append("--ignored") + + # If output is empty, there are no entries of requested status, which + # means we are clean. + return not len(self.run(*args).strip()) + + def update(self, ref): + self.run("checkout", ref) + + def find_latest_common_revision(self, base_ref_or_rev, head_rev): + return self.run("merge-base", base_ref_or_rev, head_rev).strip() + + def does_revision_exist_locally(self, revision): + try: + return self.run("cat-file", "-t", revision).strip() == "commit" + except subprocess.CalledProcessError as e: + # Error code 128 comes with the message: + # "git cat-file: could not get object info" + if e.returncode == 128: + return False + raise + + +def get_repository(path): + """Get a repository object for the repository at `path`. + If `path` is not a known VCS repository, raise an exception. + """ + for path in ancestors(path): + if os.path.isdir(os.path.join(path, ".hg")): + return HgRepository(path) + elif os.path.exists(os.path.join(path, ".git")): + return GitRepository(path) + + raise RuntimeError("Current directory is neither a git or hg repository") + + +def find_hg_revision_push_info(repository, revision): + """Given the parameters for this action and a revision, find the + pushlog_id of the revision.""" + pushlog_url = PUSHLOG_TMPL.format(repository, revision) + + def query_pushlog(url): + r = requests.get(pushlog_url, timeout=60) + r.raise_for_status() + return r + + r = retry( + query_pushlog, + args=(pushlog_url,), + attempts=5, + sleeptime=10, + ) + pushes = r.json()["pushes"] + if len(pushes) != 1: + raise RuntimeError( + "Unable to find a single pushlog_id for {} revision {}: {}".format( + repository, revision, pushes + ) + ) + pushid = list(pushes.keys())[0] + return { + "pushdate": pushes[pushid]["date"], + "pushid": pushid, + "user": pushes[pushid]["user"], + } |