1 files changed, 539 insertions, 0 deletions
diff --git a/third_party/python/taskcluster_taskgraph/taskgraph/util/vcs.py b/third_party/python/taskcluster_taskgraph/taskgraph/util/vcs.py
new file mode 100644
index 0000000000..ba1d909019
--- /dev/null
+++ b/third_party/python/taskcluster_taskgraph/taskgraph/util/vcs.py
@@ -0,0 +1,539 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+
+import logging
+import os
+import re
+import subprocess
+from abc import ABC, abstractmethod, abstractproperty
+from shutil import which
+
+import requests
+from redo import retry
+
+from taskgraph.util.path import ancestors
+
+# URL template for the hg ``json-pushes`` endpoint. It is formatted with the
+# repository URL and a changeset, in that order.
+PUSHLOG_TMPL = "{}/json-pushes?version=2&changeset={}&tipsonly=1&full=1"
+
+# Module-level logger, named after this module.
+logger = logging.getLogger(__name__)
+
+
+class Repository(ABC):
+    """Abstract base for thin wrappers around a VCS command-line tool.
+
+    Subclasses provide ``tool`` (the name of the executable) and implement
+    the abstract properties and methods declared below. Commands are
+    executed through :meth:`run`, with ``path`` as working directory.
+    """
+
+    # Both mercurial and git use sha1 as revision identifiers. Luckily, both
+    # define the same value as the null revision.
+    #
+    # https://github.com/git/git/blob/dc04167d378fb29d30e1647ff6ff51dd182bc9a3/t/oid-info/hash-info#L7
+    # https://www.mercurial-scm.org/repo/hg-stable/file/82efc31bd152/mercurial/node.py#l30
+    NULL_REVISION = "0000000000000000000000000000000000000000"
+
+    def __init__(self, path):
+        """Locate the VCS binary and remember where the checkout lives.
+
+        ``path`` is used as the working directory of every command.
+
+        Raises ``OSError`` if the executable named by ``tool`` cannot be
+        found by ``shutil.which``.
+        """
+        self.path = path
+        self.binary = which(self.tool)
+        if self.binary is None:
+            raise OSError(f"{self.tool} not found!")
+        # Lower-case letters accepted in a ``diff_filter`` argument.
+        self._valid_diff_filter = ("m", "a", "d")
+
+        # Private copy, so subclasses can add variables without touching the
+        # environment of the current process.
+        self._env = os.environ.copy()
+
+    def run(self, *args: str, **kwargs):
+        """Run the VCS binary with ``args`` and return its stdout as text.
+
+        ``return_codes`` is an optional keyword argument holding the
+        non-zero exit codes to tolerate: when the command exits with one of
+        them, an empty string is returned instead of raising. Every other
+        keyword argument is forwarded to ``subprocess.check_output``.
+
+        Raises ``subprocess.CalledProcessError`` for any other non-zero exit
+        code.
+        """
+        return_codes = kwargs.pop("return_codes", [])
+        cmd = (self.binary,) + args
+
+        try:
+            return subprocess.check_output(
+                cmd, cwd=self.path, env=self._env, encoding="utf-8", **kwargs
+            )
+        except subprocess.CalledProcessError as e:
+            if e.returncode in return_codes:
+                return ""
+            raise
+
+    # ``abc.abstractproperty`` is deprecated since Python 3.3; stacking
+    # ``property`` on top of ``abstractmethod`` is the supported spelling.
+    @property
+    @abstractmethod
+    def tool(self) -> str:
+        """Version control system being used, either 'hg' or 'git'."""
+
+    @property
+    @abstractmethod
+    def head_rev(self) -> str:
+        """Hash of HEAD revision."""
+
+    @property
+    @abstractmethod
+    def base_rev(self):
+        """Hash of revision the current topic branch is based on."""
+
+    @property
+    @abstractmethod
+    def branch(self):
+        """Current branch or bookmark the checkout has active."""
+
+    @property
+    @abstractmethod
+    def all_remote_names(self):
+        """Name of all configured remote repositories."""
+
+    @property
+    @abstractmethod
+    def default_remote_name(self):
+        """Name the VCS defines for the remote repository when cloning
+        it for the first time. This name may not exist anymore if users
+        changed the default configuration, for instance."""
+
+    @property
+    @abstractmethod
+    def remote_name(self):
+        """Name of the remote repository."""
+
+    def _get_most_suitable_remote(self, remote_instructions):
+        """Pick one remote name out of ``all_remote_names``.
+
+        A single configured remote is returned as is. Otherwise
+        ``default_remote_name`` wins when it is configured. Failing that,
+        the first remote is returned and a warning is logged, which embeds
+        ``remote_instructions`` as a hint on how to configure the default.
+        """
+        remotes = self.all_remote_names
+        if len(remotes) == 1:
+            return remotes[0]
+
+        if self.default_remote_name in remotes:
+            return self.default_remote_name
+
+        first_remote = remotes[0]
+        logger.warning(
+            f"Unable to determine which remote repository to use between: {remotes}. "
+            f'Arbitrarily using the first one "{first_remote}". Please set an '
+            f"`{self.default_remote_name}` remote if the arbitrarily selected one "
+            f"is not right. To do so: {remote_instructions}"
+        )
+
+        return first_remote
+
+    @property
+    @abstractmethod
+    def default_branch(self):
+        """Name of the default branch."""
+
+    @abstractmethod
+    def get_url(self, remote=None):
+        """Get URL of the upstream repository."""
+
+    @abstractmethod
+    def get_commit_message(self, revision=None):
+        """Commit message of specified revision or current commit."""
+
+    @abstractmethod
+    def get_changed_files(
+        self, diff_filter="ADM", mode="unstaged", rev=None, base_rev=None
+    ):
+        """Return a list of files that are changed in:
+         * either this repository's working copy,
+         * or at a given revision (``rev``)
+         * or between 2 revisions (``base_rev`` and ``rev``)
+
+        ``diff_filter`` controls which kinds of modifications are returned.
+        It is a string which may only contain the following characters:
+
+            A - Include files that were added
+            D - Include files that were deleted
+            M - Include files that were modified
+
+        By default, all three will be included.
+
+        ``mode`` can be one of 'unstaged', 'staged' or 'all'. Only has an
+        effect on git. Defaults to 'unstaged'.
+
+        ``rev`` is a specifier for which changesets to consider for
+        changes. The exact meaning depends on the vcs system being used.
+
+        ``base_rev`` specifies the range of changesets. This parameter cannot
+        be used without ``rev``. The range includes ``rev`` but excludes
+        ``base_rev``.
+        """
+
+    @abstractmethod
+    def get_outgoing_files(self, diff_filter="ADM", upstream=None):
+        """Return a list of changed files compared to upstream.
+
+        ``diff_filter`` works the same as `get_changed_files`.
+        ``upstream`` is a remote ref to compare against. If unspecified,
+        the implementation decides by itself what to compare against.
+        """
+
+    @abstractmethod
+    def working_directory_clean(self, untracked=False, ignored=False):
+        """Determine if the working directory is free of modifications.
+
+        Returns True if the working directory does not have any file
+        modifications. False otherwise.
+
+        By default, untracked and ignored files are not considered. If
+        ``untracked`` or ``ignored`` are set, they influence the clean check
+        to factor these file classes into consideration.
+        """
+
+    @abstractmethod
+    def update(self, ref):
+        """Update the working directory to the specified reference."""
+
+    @abstractmethod
+    def find_latest_common_revision(self, base_ref_or_rev, head_rev):
+        """Find the latest revision that is common to both the given
+        ``head_rev`` and ``base_ref_or_rev``"""
+
+    @abstractmethod
+    def does_revision_exist_locally(self, revision):
+        """Check whether this revision exists in the local repository.
+
+        If this function returns an unexpected value, then make sure
+        the revision was fetched from the remote repository."""
+
+
+class HgRepository(Repository):
+    """Mercurial implementation of the repository interface.
+
+    Every command is run with ``HGPLAIN=1`` in its environment.
+    """
+
+    tool = "hg"
+    default_remote_name = "default"
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        # HGPLAIN is Mercurial's switch for script-friendly output (see
+        # `hg help environment`).
+        self._env["HGPLAIN"] = "1"
+
+    @property
+    def head_rev(self):
+        """Full hash of the working directory parent (``.``)."""
+        return self.run("log", "-r", ".", "-T", "{node}").strip()
+
+    @property
+    def base_rev(self):
+        """Full hash of the last public ancestor of ``.``."""
+        # Stripped for consistency with ``head_rev``.
+        return self.run(
+            "log", "-r", "last(ancestors(.) and public())", "-T", "{node}"
+        ).strip()
+
+    @property
+    def branch(self):
+        """Content of ``.hg/bookmarks.current``, or ``None`` when that
+        file is missing or empty."""
+        bookmarks_fn = os.path.join(self.path, ".hg", "bookmarks.current")
+        if os.path.exists(bookmarks_fn):
+            with open(bookmarks_fn) as f:
+                bookmark = f.read()
+                return bookmark or None
+
+        return None
+
+    @property
+    def all_remote_names(self):
+        """Names printed by ``hg paths --quiet``.
+
+        Raises ``RuntimeError`` when no path is configured.
+        """
+        remotes = self.run("paths", "--quiet").splitlines()
+        if not remotes:
+            raise RuntimeError("No remotes defined")
+        return remotes
+
+    @property
+    def remote_name(self):
+        """Remote selected by ``_get_most_suitable_remote``."""
+        return self._get_most_suitable_remote(
+            "Edit .hg/hgrc and add:\n\n[paths]\ndefault = $URL",
+        )
+
+    @property
+    def default_branch(self):
+        """Always ``"default"``."""
+        # Mercurial recommends keeping "default"
+        # https://www.mercurial-scm.org/wiki/StandardBranching#Don.27t_use_a_name_other_than_default_for_your_main_development_branch
+        return "default"
+
+    def get_url(self, remote="default"):
+        """URL configured for the path named ``remote``."""
+        return self.run("path", "-T", "{url}", remote).strip()
+
+    def get_commit_message(self, revision=None):
+        """Description of ``revision``, or of the current commit when
+        ``revision`` is not given."""
+        revision = revision or self.head_rev
+        # Query the requested revision; a hard-coded "." would silently
+        # ignore the argument.
+        return self.run("log", "-r", revision, "-T", "{desc}")
+
+    def _format_diff_filter(self, diff_filter, for_status=False):
+        """Lower-case ``diff_filter`` and check that it only holds valid
+        letters. With ``for_status`` set, ``d`` is expanded to ``dr``."""
+        df = diff_filter.lower()
+        assert all(f in self._valid_diff_filter for f in df)
+
+        # When looking at the changes in the working directory, the hg status
+        # command uses 'd' for files that have been deleted with a non-hg
+        # command, and 'r' for files that have been `hg rm`ed. Use both.
+        return df.replace("d", "dr") if for_status else df
+
+    def _files_template(self, diff_filter):
+        """Build a log template that prints, one per line, the files
+        matching ``diff_filter`` (added, deleted and/or modified)."""
+        template = ""
+        df = self._format_diff_filter(diff_filter)
+        if "a" in df:
+            template += "{file_adds % '{file}\\n'}"
+        if "d" in df:
+            template += "{file_dels % '{file}\\n'}"
+        if "m" in df:
+            template += "{file_mods % '{file}\\n'}"
+        return template
+
+    def get_changed_files(
+        self, diff_filter="ADM", mode="unstaged", rev=None, base_rev=None
+    ):
+        """List changed files, see ``Repository.get_changed_files``.
+
+        Without ``rev`` the working directory is inspected through
+        ``hg status``; otherwise ``hg log`` is queried for ``rev`` or for
+        the ``base_rev~-1::rev`` range. ``mode`` is not used here.
+
+        Raises ``ValueError`` if ``base_rev`` is given without ``rev``.
+        """
+        if rev is None:
+            if base_rev is not None:
+                raise ValueError("Cannot specify `base_rev` without `rev`")
+            # Use --no-status to print just the filename.
+            df = self._format_diff_filter(diff_filter, for_status=True)
+            return self.run("status", "--no-status", f"-{df}").splitlines()
+
+        template = self._files_template(diff_filter)
+        revision_argument = rev if base_rev is None else f"{base_rev}~-1::{rev}"
+        return self.run("log", "-r", revision_argument, "-T", template).splitlines()
+
+    def get_outgoing_files(self, diff_filter="ADM", upstream=None):
+        """List files changed compared to ``upstream``.
+
+        Without ``upstream``, the draft ancestors of ``.`` are used.
+        Otherwise ``hg outgoing`` is run against ``upstream``; exit code 1
+        is tolerated and results in an empty list.
+        """
+        template = self._files_template(diff_filter)
+
+        # The template prints one file per line, so split on line
+        # boundaries only: a bare ``split()`` would also cut file names
+        # that contain whitespace.
+        if not upstream:
+            return self.run(
+                "log", "-r", "draft() and ancestors(.)", "--template", template
+            ).splitlines()
+
+        return self.run(
+            "outgoing",
+            "-r",
+            ".",
+            "--quiet",
+            "--template",
+            template,
+            upstream,
+            return_codes=(1,),
+        ).splitlines()
+
+    def working_directory_clean(self, untracked=False, ignored=False):
+        """Return True when ``hg status`` reports nothing for the
+        requested file classes."""
+        args = ["status", "--modified", "--added", "--removed", "--deleted"]
+        if untracked:
+            args.append("--unknown")
+        if ignored:
+            args.append("--ignored")
+
+        # If output is empty, there are no entries of requested status, which
+        # means we are clean.
+        return not self.run(*args).strip()
+
+    def update(self, ref):
+        """Run ``hg update --check ref`` and return its output."""
+        return self.run("update", "--check", ref)
+
+    def find_latest_common_revision(self, base_ref_or_rev, head_rev):
+        """Hash of the last revision that is an ancestor of both
+        ``base_ref_or_rev`` and ``head_rev``."""
+        return self.run(
+            "log",
+            "-r",
+            f"last(ancestors('{base_ref_or_rev}') and ancestors('{head_rev}'))",
+            "--template",
+            "{node}",
+        ).strip()
+
+    def does_revision_exist_locally(self, revision):
+        """Return True when ``hg log -r revision`` prints something and
+        False when it exits with code 255; other failures are re-raised."""
+        try:
+            return self.run("log", "-r", revision).strip() != ""
+        except subprocess.CalledProcessError as e:
+            # Error code 255 comes with the message:
+            # "abort: unknown revision $REVISION"
+            if e.returncode == 255:
+                return False
+            raise
+
+
+class GitRepository(Repository):
+    """Git implementation of the repository interface."""
+
+    tool = "git"
+    default_remote_name = "origin"
+
+    # Matches the symref line of `git ls-remote --symref <remote> HEAD`.
+    _LS_REMOTE_PATTERN = re.compile(r"ref:\s+refs/heads/(?P<branch_name>\S+)\s+HEAD")
+
+    @property
+    def head_rev(self):
+        """Hash that ``HEAD`` resolves to."""
+        return self.run("rev-parse", "--verify", "HEAD").strip()
+
+    @property
+    def base_rev(self):
+        """Last line printed by ``git rev-list HEAD --topo-order
+        --boundary --not --remotes``, without its boundary marker.
+        Falls back to ``head_rev`` when nothing is printed."""
+        refs = self.run(
+            "rev-list", "HEAD", "--topo-order", "--boundary", "--not", "--remotes"
+        ).splitlines()
+        if not refs:
+            return self.head_rev
+
+        last = refs[-1]
+        # boundary starts with a prefix `-`; only drop it when present so a
+        # plain hash is never truncated.
+        return last[1:] if last.startswith("-") else last
+
+    @property
+    def branch(self):
+        """Output of ``git branch --show-current``, or ``None`` when it
+        is empty."""
+        return self.run("branch", "--show-current").strip() or None
+
+    @property
+    def all_remote_names(self):
+        """Names printed by ``git remote``.
+
+        Raises ``RuntimeError`` when no remote is configured.
+        """
+        remotes = self.run("remote").splitlines()
+        if not remotes:
+            raise RuntimeError("No remotes defined")
+        return remotes
+
+    @property
+    def remote_name(self):
+        """Remote of the upstream of the current branch (``@{u}``), or
+        the one picked by ``_get_most_suitable_remote`` when git exits
+        with code 128."""
+        try:
+            remote_branch_name = self.run(
+                "rev-parse", "--verify", "--abbrev-ref", "--symbolic-full-name", "@{u}"
+            ).strip()
+            return remote_branch_name.split("/")[0]
+        except subprocess.CalledProcessError as e:
+            # Error code 128 comes with the message:
+            # "fatal: no upstream configured for branch $BRANCH"
+            if e.returncode != 128:
+                raise
+
+        return self._get_most_suitable_remote("`git remote add origin $URL`")
+
+    @property
+    def default_branch(self):
+        """Name of the default branch, found by trying three strategies
+        in turn; the last one may raise ``RuntimeError``."""
+        try:
+            # this one works if the current repo was cloned from an existing
+            # repo elsewhere
+            return self._get_default_branch_from_cloned_metadata()
+        except (subprocess.CalledProcessError, RuntimeError):
+            pass
+
+        try:
+            # This call works if you have (network) access to the repo
+            return self._get_default_branch_from_remote_query()
+        except (subprocess.CalledProcessError, RuntimeError):
+            pass
+
+        # this one is the last resort in case the remote is not accessible and
+        # the local repo is where `git init` was made
+        return self._guess_default_branch()
+
+    def _get_default_branch_from_remote_query(self):
+        """Ask the remote which branch its ``HEAD`` points to and return
+        it as ``<remote>/<branch>``.
+
+        Raises ``RuntimeError`` when the ``ls-remote`` output does not
+        match ``_LS_REMOTE_PATTERN``.
+        """
+        # This function requires network access to the repo
+        remote_name = self.remote_name
+        output = self.run("ls-remote", "--symref", remote_name, "HEAD")
+        matches = self._LS_REMOTE_PATTERN.search(output)
+        if not matches:
+            # Both parts are f-strings so the raw output really ends up in
+            # the message.
+            raise RuntimeError(
+                f'Could not find the default branch of remote repository "{remote_name}". '
+                f"Got: {output}"
+            )
+
+        branch_name = matches.group("branch_name")
+        return f"{remote_name}/{branch_name}"
+
+    def _get_default_branch_from_cloned_metadata(self):
+        """Resolve ``<remote>/HEAD`` to its abbreviated ref name."""
+        return self.run("rev-parse", "--abbrev-ref", f"{self.remote_name}/HEAD").strip()
+
+    def _guess_default_branch(self):
+        """Return the only ref whose name ends with one of the usual
+        default branch names.
+
+        Raises ``RuntimeError`` when zero or several refs match.
+        """
+        candidates = ("main", "master", "branches/default/tip")
+        refnames = self.run(
+            "branch", "--all", "--no-color", "--format=%(refname)"
+        ).splitlines()
+        branches = [
+            ref.strip() for ref in refnames if ref.strip().endswith(candidates)
+        ]
+
+        if len(branches) == 1:
+            return branches[0]
+
+        raise RuntimeError(f"Unable to find default branch. Got: {branches}")
+
+    def get_url(self, remote="origin"):
+        """URL configured for ``remote``."""
+        return self.run("remote", "get-url", remote).strip()
+
+    def get_commit_message(self, revision=None):
+        """Raw message body of ``revision``, or of the current commit
+        when ``revision`` is not given."""
+        revision = revision or self.head_rev
+        # Pass the revision on; without it git always reports HEAD.
+        return self.run("log", "-n1", "--format=%B", revision)
+
+    def get_changed_files(
+        self, diff_filter="ADM", mode="unstaged", rev=None, base_rev=None
+    ):
+        """List changed files, see ``Repository.get_changed_files``.
+
+        Without ``rev``, ``git diff`` is used: against the index by
+        default, with ``--cached`` for ``mode="staged"`` and against
+        ``HEAD`` for ``mode="all"``. With ``rev``, ``git log`` is run on
+        ``rev~1..rev`` or on ``base_rev..rev``.
+
+        Raises ``ValueError`` if ``base_rev`` is given without ``rev``.
+        """
+        assert all(f.lower() in self._valid_diff_filter for f in diff_filter)
+
+        if rev is None:
+            if base_rev is not None:
+                raise ValueError("Cannot specify `base_rev` without `rev`")
+            cmd = ["diff"]
+            if mode == "staged":
+                cmd.append("--cached")
+            elif mode == "all":
+                cmd.append("HEAD")
+        else:
+            revision_argument = (
+                f"{rev}~1..{rev}" if base_rev is None else f"{base_rev}..{rev}"
+            )
+            cmd = ["log", "--format=format:", revision_argument]
+
+        cmd.append("--name-only")
+        cmd.append("--diff-filter=" + diff_filter.upper())
+
+        # The empty format leaves blank lines between commits; drop them.
+        files = self.run(*cmd).splitlines()
+        return [f for f in files if f]
+
+    def get_outgoing_files(self, diff_filter="ADM", upstream=None):
+        """List files touched by commits reachable from ``HEAD`` but not
+        from ``upstream`` (or from any remote ref when ``upstream`` is
+        not given)."""
+        assert all(f.lower() in self._valid_diff_filter for f in diff_filter)
+
+        not_condition = upstream if upstream else "--remotes"
+
+        files = self.run(
+            "log",
+            "--name-only",
+            f"--diff-filter={diff_filter.upper()}",
+            "--oneline",
+            "--pretty=format:",
+            "HEAD",
+            "--not",
+            not_condition,
+        ).splitlines()
+        return [f for f in files if f]
+
+    def working_directory_clean(self, untracked=False, ignored=False):
+        """Return True when ``git status --porcelain`` reports nothing
+        for the requested file classes."""
+        args = ["status", "--porcelain"]
+
+        # Even in --porcelain mode, behavior is affected by the
+        # ``status.showUntrackedFiles`` option, which means we need to be
+        # explicit about how to treat untracked files.
+        if untracked:
+            args.append("--untracked-files=all")
+        else:
+            args.append("--untracked-files=no")
+
+        if ignored:
+            args.append("--ignored")
+
+        # If output is empty, there are no entries of requested status, which
+        # means we are clean.
+        return not self.run(*args).strip()
+
+    def update(self, ref):
+        """Check out ``ref``."""
+        self.run("checkout", ref)
+
+    def find_latest_common_revision(self, base_ref_or_rev, head_rev):
+        """Output of ``git merge-base base_ref_or_rev head_rev``."""
+        return self.run("merge-base", base_ref_or_rev, head_rev).strip()
+
+    def does_revision_exist_locally(self, revision):
+        """Return True when ``revision`` names a commit object and False
+        when git exits with code 128; other failures are re-raised."""
+        try:
+            return self.run("cat-file", "-t", revision).strip() == "commit"
+        except subprocess.CalledProcessError as e:
+            # Error code 128 comes with the message:
+            # "git cat-file: could not get object info"
+            if e.returncode == 128:
+                return False
+            raise
+
+
+def get_repository(path):
+    """Return a repository object for the checkout that contains `path`.
+
+    Each candidate yielded by ``ancestors(path)`` is inspected in order: a
+    ``.hg`` directory gives an ``HgRepository``, otherwise an existing
+    ``.git`` entry gives a ``GitRepository``.
+
+    Raises ``RuntimeError`` when no candidate is a known VCS repository.
+    """
+    for candidate in ancestors(path):
+        hg_dir = os.path.join(candidate, ".hg")
+        if os.path.isdir(hg_dir):
+            return HgRepository(candidate)
+
+        git_entry = os.path.join(candidate, ".git")
+        if os.path.exists(git_entry):
+            return GitRepository(candidate)
+
+    raise RuntimeError("Current directory is neither a git or hg repository")
+
+
+def find_hg_revision_push_info(repository, revision):
+    """Look up the push that contains ``revision`` in the hg pushlog.
+
+    ``repository`` and ``revision`` are substituted into ``PUSHLOG_TMPL`` to
+    build the URL that is queried.
+
+    Returns a dict with the keys ``pushdate``, ``pushid`` and ``user``.
+
+    Raises ``RuntimeError`` when the response does not hold exactly one
+    push. Request failures are left to ``redo.retry``, which is given 5
+    attempts and a sleep time of 10.
+    """
+    pushlog_url = PUSHLOG_TMPL.format(repository, revision)
+
+    def query_pushlog(url):
+        # Fetch the URL handed over by ``retry`` instead of reaching for the
+        # enclosing variable, so the helper honours its own parameter.
+        r = requests.get(url, timeout=60)
+        r.raise_for_status()
+        return r
+
+    r = retry(
+        query_pushlog,
+        args=(pushlog_url,),
+        attempts=5,
+        sleeptime=10,
+    )
+    pushes = r.json()["pushes"]
+    if len(pushes) != 1:
+        raise RuntimeError(
+            f"Unable to find a single pushlog_id for {repository} "
+            f"revision {revision}: {pushes}"
+        )
+    # Exactly one entry is left at this point; its key is the push id.
+    pushid = next(iter(pushes))
+    push = pushes[pushid]
+    return {
+        "pushdate": push["date"],
+        "pushid": pushid,
+        "user": push["user"],
+    }