summaryrefslogtreecommitdiffstats
path: root/third_party/python/taskcluster_taskgraph/taskgraph/util/vcs.py
blob: ba1d909019919b7ab9ad29d6ebbd622b59ad3414 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.


import logging
import os
import re
import subprocess
from abc import ABC, abstractmethod, abstractproperty
from shutil import which

import requests
from redo import retry

from taskgraph.util.path import ancestors

# Template for the pushlog query URL. The first placeholder receives the
# repository (used as the URL prefix) and the second the changeset; see
# `find_hg_revision_push_info`.
PUSHLOG_TMPL = "{}/json-pushes?version=2&changeset={}&tipsonly=1&full=1"

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)


class Repository(ABC):
    """Abstract base class describing a local version control checkout.

    Subclasses provide the ``tool`` name and implement the abstract
    properties and methods declared below. VCS commands are executed through
    :meth:`run`, using ``path`` as the working directory and a private copy
    of the process environment.
    """

    # Both mercurial and git use sha1 as revision identifiers. Luckily, both
    # define the same value as the null revision.
    #
    # https://github.com/git/git/blob/dc04167d378fb29d30e1647ff6ff51dd182bc9a3/t/oid-info/hash-info#L7
    # https://www.mercurial-scm.org/repo/hg-stable/file/82efc31bd152/mercurial/node.py#l30
    NULL_REVISION = "0000000000000000000000000000000000000000"

    def __init__(self, path):
        """Bind the repository to ``path`` and locate the VCS executable.

        Raises:
            OSError: if the executable named by ``tool`` is not on ``PATH``.
        """
        self.path = path
        self.binary = which(self.tool)
        if self.binary is None:
            raise OSError(f"{self.tool} not found!")
        # Lower-cased characters accepted in a ``diff_filter`` argument.
        self._valid_diff_filter = ("m", "a", "d")

        # Private copy so subclasses can add variables without touching
        # ``os.environ``.
        self._env = os.environ.copy()

    def run(self, *args: str, **kwargs):
        """Run the VCS binary with ``args`` and return its decoded stdout.

        ``return_codes`` (keyword only, default ``[]``) lists non-zero exit
        codes that are tolerated: when the command exits with one of them an
        empty string is returned instead of raising. Any other keyword
        argument is forwarded to ``subprocess.check_output``.

        Raises:
            subprocess.CalledProcessError: if the command exits with a
                non-zero code that is not listed in ``return_codes``.
        """
        return_codes = kwargs.pop("return_codes", [])
        cmd = (self.binary, *args)

        try:
            return subprocess.check_output(
                cmd, cwd=self.path, env=self._env, encoding="utf-8", **kwargs
            )
        except subprocess.CalledProcessError as e:
            if e.returncode in return_codes:
                return ""
            raise

    # ``abc.abstractproperty`` is deprecated since Python 3.3; stacking
    # ``@property`` on ``@abstractmethod`` is the supported equivalent.
    @property
    @abstractmethod
    def tool(self) -> str:
        """Version control system being used, either 'hg' or 'git'."""

    @property
    @abstractmethod
    def head_rev(self) -> str:
        """Hash of HEAD revision."""

    @property
    @abstractmethod
    def base_rev(self):
        """Hash of revision the current topic branch is based on."""

    @property
    @abstractmethod
    def branch(self):
        """Current branch or bookmark the checkout has active."""

    @property
    @abstractmethod
    def all_remote_names(self):
        """Name of all configured remote repositories."""

    @property
    @abstractmethod
    def default_remote_name(self):
        """Name the VCS defines for the remote repository when cloning
        it for the first time. This name may not exist anymore if users
        changed the default configuration, for instance."""

    @property
    @abstractmethod
    def remote_name(self):
        """Name of the remote repository."""

    def _get_most_suitable_remote(self, remote_instructions):
        """Pick one remote name out of ``all_remote_names``.

        A single configured remote is returned as is; otherwise
        ``default_remote_name`` is preferred when it is configured. Failing
        both, the first remote is returned and a warning is logged that
        includes ``remote_instructions`` (a hint on how to configure the
        default remote).
        """
        remotes = self.all_remote_names
        if len(remotes) == 1:
            return remotes[0]

        if self.default_remote_name in remotes:
            return self.default_remote_name

        first_remote = remotes[0]
        logger.warning(
            f"Unable to determine which remote repository to use between: {remotes}. "
            f'Arbitrarily using the first one "{first_remote}". Please set an '
            f"`{self.default_remote_name}` remote if the arbitrarily selected one "
            f"is not right. To do so: {remote_instructions}"
        )

        return first_remote

    @property
    @abstractmethod
    def default_branch(self):
        """Name of the default branch."""

    @abstractmethod
    def get_url(self, remote=None):
        """Get URL of the upstream repository."""

    @abstractmethod
    def get_commit_message(self, revision=None):
        """Commit message of specified revision or current commit."""

    @abstractmethod
    def get_changed_files(self, diff_filter, mode="unstaged", rev=None, base_rev=None):
        """Return a list of files that are changed in:
         * either this repository's working copy,
         * or at a given revision (``rev``)
         * or between 2 revisions (``base_rev`` and ``rev``)

        ``diff_filter`` controls which kinds of modifications are returned.
        It is a string which may only contain the following characters:

            A - Include files that were added
            D - Include files that were deleted
            M - Include files that were modified

        By default, all three will be included.

        ``mode`` can be one of 'unstaged', 'staged' or 'all'. Only has an
        effect on git. Defaults to 'unstaged'.

        ``rev`` is a specifier for which changesets to consider for
        changes. The exact meaning depends on the vcs system being used.

        ``base_rev`` specifies the range of changesets. This parameter cannot
        be used without ``rev``. The range includes ``rev`` but excludes
        ``base_rev``.
        """

    @abstractmethod
    def get_outgoing_files(self, diff_filter, upstream):
        """Return a list of changed files compared to upstream.

        ``diff_filter`` works the same as `get_changed_files`.
        ``upstream`` is a remote ref to compare against. If unspecified,
        this will be determined automatically. If there is no remote ref,
        a MissingUpstreamRepo exception will be raised.

        NOTE(review): no ``MissingUpstreamRepo`` exception is defined or
        raised in this module -- confirm whether this sentence is still
        accurate.
        """

    @abstractmethod
    def working_directory_clean(self, untracked=False, ignored=False):
        """Determine if the working directory is free of modifications.

        Returns True if the working directory does not have any file
        modifications. False otherwise.

        By default, untracked and ignored files are not considered. If
        ``untracked`` or ``ignored`` are set, they influence the clean check
        to factor these file classes into consideration.
        """

    @abstractmethod
    def update(self, ref):
        """Update the working directory to the specified reference."""

    @abstractmethod
    def find_latest_common_revision(self, base_ref_or_rev, head_rev):
        """Find the latest revision that is common to both the given
        ``head_rev`` and ``base_ref_or_rev``"""

    @abstractmethod
    def does_revision_exist_locally(self, revision):
        """Check whether this revision exists in the local repository.

        If this function returns an unexpected value, then make sure
        the revision was fetched from the remote repository."""


class HgRepository(Repository):
    """Mercurial implementation of ``Repository``, driven by the ``hg`` binary."""

    tool = "hg"
    default_remote_name = "default"

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Run hg with HGPLAIN set so its output is meant for scripts rather
        # than shaped by user configuration (see `hg help environment`).
        self._env["HGPLAIN"] = "1"

    @property
    def head_rev(self):
        """Full node hash of the working directory parent (``.``)."""
        return self.run("log", "-r", ".", "-T", "{node}").strip()

    @property
    def base_rev(self):
        """Node hash of the last public ancestor of ``.``."""
        return self.run("log", "-r", "last(ancestors(.) and public())", "-T", "{node}")

    @property
    def branch(self):
        """Content of ``.hg/bookmarks.current``, or ``None`` when the file is
        missing or empty."""
        bookmarks_fn = os.path.join(self.path, ".hg", "bookmarks.current")
        if os.path.exists(bookmarks_fn):
            with open(bookmarks_fn) as f:
                bookmark = f.read()
                return bookmark or None

        return None

    @property
    def all_remote_names(self):
        """Names printed by ``hg paths --quiet``.

        Raises:
            RuntimeError: if no path is configured.
        """
        remotes = self.run("paths", "--quiet").splitlines()
        if not remotes:
            raise RuntimeError("No remotes defined")
        return remotes

    @property
    def remote_name(self):
        """Remote chosen by ``_get_most_suitable_remote``."""
        return self._get_most_suitable_remote(
            "Edit .hg/hgrc and add:\n\n[paths]\ndefault = $URL",
        )

    @property
    def default_branch(self):
        # Mercurial recommends keeping "default"
        # https://www.mercurial-scm.org/wiki/StandardBranching#Don.27t_use_a_name_other_than_default_for_your_main_development_branch
        return "default"

    def get_url(self, remote="default"):
        """URL configured for the path named ``remote``."""
        return self.run("path", "-T", "{url}", remote).strip()

    def get_commit_message(self, revision=None):
        """Commit description of ``revision``, or of the head revision when
        ``revision`` is not given."""
        revision = revision or self.head_rev
        # Query the requested revision; previously "." was always used and
        # the ``revision`` argument was silently ignored.
        return self.run("log", "-r", revision, "-T", "{desc}")

    def _format_diff_filter(self, diff_filter, for_status=False):
        """Lower-case and validate ``diff_filter``.

        With ``for_status`` the result is adapted to ``hg status`` flags.
        """
        df = diff_filter.lower()
        assert all(f in self._valid_diff_filter for f in df)

        # When looking at the changes in the working directory, the hg status
        # command uses 'd' for files that have been deleted with a non-hg
        # command, and 'r' for files that have been `hg rm`ed. Use both.
        return df.replace("d", "dr") if for_status else df

    def _files_template(self, diff_filter):
        """Build an ``hg log`` template printing one file per line for each
        kind of change selected by ``diff_filter``."""
        template = ""
        df = self._format_diff_filter(diff_filter)
        if "a" in df:
            template += "{file_adds % '{file}\\n'}"
        if "d" in df:
            template += "{file_dels % '{file}\\n'}"
        if "m" in df:
            template += "{file_mods % '{file}\\n'}"
        return template

    def get_changed_files(
        self, diff_filter="ADM", mode="unstaged", rev=None, base_rev=None
    ):
        """List changed files; see ``Repository.get_changed_files``.

        ``mode`` is accepted for interface compatibility and is not used.

        Raises:
            ValueError: if ``base_rev`` is given without ``rev``.
        """
        if rev is None:
            if base_rev is not None:
                raise ValueError("Cannot specify `base_rev` without `rev`")
            # Use --no-status to print just the filename.
            df = self._format_diff_filter(diff_filter, for_status=True)
            return self.run("status", "--no-status", f"-{df}").splitlines()
        else:
            template = self._files_template(diff_filter)
            revision_argument = rev if base_rev is None else f"{base_rev}~-1::{rev}"
            return self.run("log", "-r", revision_argument, "-T", template).splitlines()

    def get_outgoing_files(self, diff_filter="ADM", upstream=None):
        """List files changed in outgoing changesets.

        Without ``upstream`` the draft ancestors of ``.`` are inspected;
        otherwise ``hg outgoing`` is run against ``upstream`` (exit code 1
        is tolerated and yields an empty list).
        """
        template = self._files_template(diff_filter)

        if not upstream:
            output = self.run(
                "log", "-r", "draft() and ancestors(.)", "--template", template
            )
        else:
            output = self.run(
                "outgoing",
                "-r",
                ".",
                "--quiet",
                "--template",
                template,
                upstream,
                return_codes=(1,),
            )

        # The template emits one file per line; split on line boundaries
        # (not on arbitrary whitespace) so names containing spaces survive.
        return [f for f in output.splitlines() if f]

    def working_directory_clean(self, untracked=False, ignored=False):
        """Return True when ``hg status`` reports no entry of the requested
        kinds."""
        args = ["status", "--modified", "--added", "--removed", "--deleted"]
        if untracked:
            args.append("--unknown")
        if ignored:
            args.append("--ignored")

        # If output is empty, there are no entries of requested status, which
        # means we are clean.
        return not self.run(*args).strip()

    def update(self, ref):
        """Run ``hg update --check ref`` and return its output."""
        return self.run("update", "--check", ref)

    def find_latest_common_revision(self, base_ref_or_rev, head_rev):
        """Node hash of the last revision that is an ancestor of both
        ``base_ref_or_rev`` and ``head_rev``."""
        return self.run(
            "log",
            "-r",
            f"last(ancestors('{base_ref_or_rev}') and ancestors('{head_rev}'))",
            "--template",
            "{node}",
        ).strip()

    def does_revision_exist_locally(self, revision):
        """Return True when ``hg log -r revision`` prints something, False
        when hg exits with code 255; any other failure is re-raised."""
        try:
            return self.run("log", "-r", revision).strip() != ""
        except subprocess.CalledProcessError as e:
            # Error code 255 comes with the message:
            # "abort: unknown revision $REVISION"
            if e.returncode == 255:
                return False
            raise


class GitRepository(Repository):
    """Git implementation of ``Repository``, driven by the ``git`` binary."""

    tool = "git"
    default_remote_name = "origin"

    # Matches the symref line of `git ls-remote --symref <remote> HEAD`.
    _LS_REMOTE_PATTERN = re.compile(r"ref:\s+refs/heads/(?P<branch_name>\S+)\s+HEAD")

    @property
    def head_rev(self):
        """Hash that ``HEAD`` resolves to."""
        return self.run("rev-parse", "--verify", "HEAD").strip()

    @property
    def base_rev(self):
        """Last boundary commit between ``HEAD`` and the remote refs, or
        ``head_rev`` when ``git rev-list`` prints nothing."""
        refs = self.run(
            "rev-list", "HEAD", "--topo-order", "--boundary", "--not", "--remotes"
        ).splitlines()
        if refs:
            return refs[-1][1:]  # boundary starts with a prefix `-`
        return self.head_rev

    @property
    def branch(self):
        """Name printed by ``git branch --show-current``, or ``None`` when it
        prints nothing."""
        return self.run("branch", "--show-current").strip() or None

    @property
    def all_remote_names(self):
        """Names printed by ``git remote``.

        Raises:
            RuntimeError: if no remote is configured.
        """
        remotes = self.run("remote").splitlines()
        if not remotes:
            raise RuntimeError("No remotes defined")
        return remotes

    @property
    def remote_name(self):
        """Remote of the current branch's upstream, falling back to
        ``_get_most_suitable_remote`` when no upstream is configured."""
        try:
            remote_branch_name = self.run(
                "rev-parse", "--verify", "--abbrev-ref", "--symbolic-full-name", "@{u}"
            ).strip()
            return remote_branch_name.split("/")[0]
        except subprocess.CalledProcessError as e:
            # Error code 128 comes with the message:
            # "fatal: no upstream configured for branch $BRANCH"
            if e.returncode != 128:
                raise

        return self._get_most_suitable_remote("`git remote add origin $URL`")

    @property
    def default_branch(self):
        """Default branch, determined by trying three strategies in turn."""
        try:
            # this one works if the current repo was cloned from an existing
            # repo elsewhere
            return self._get_default_branch_from_cloned_metadata()
        except (subprocess.CalledProcessError, RuntimeError):
            pass

        try:
            # This call works if you have (network) access to the repo
            return self._get_default_branch_from_remote_query()
        except (subprocess.CalledProcessError, RuntimeError):
            pass

        # this one is the last resort in case the remote is not accessible and
        # the local repo is where `git init` was made
        return self._guess_default_branch()

    def _get_default_branch_from_remote_query(self):
        """Ask the remote which branch its ``HEAD`` points to.

        Returns ``"<remote>/<branch>"``.

        Raises:
            RuntimeError: if the ``ls-remote`` output has no symref line.
        """
        # This function requires network access to the repo
        remote_name = self.remote_name
        output = self.run("ls-remote", "--symref", remote_name, "HEAD")
        matches = self._LS_REMOTE_PATTERN.search(output)
        if not matches:
            # Both fragments must be f-strings, otherwise the raw command
            # output is never interpolated into the message.
            raise RuntimeError(
                f'Could not find the default branch of remote repository "{remote_name}". '
                f"Got: {output}"
            )

        branch_name = matches.group("branch_name")
        return f"{remote_name}/{branch_name}"

    def _get_default_branch_from_cloned_metadata(self):
        """Resolve ``<remote>/HEAD`` to its abbreviated ref name."""
        return self.run("rev-parse", "--abbrev-ref", f"{self.remote_name}/HEAD").strip()

    def _guess_default_branch(self):
        """Pick the only ref whose name ends with a well-known default branch
        name.

        Raises:
            RuntimeError: if zero or several refs qualify.
        """
        candidates = ("main", "master", "branches/default/tip")
        refnames = self.run(
            "branch", "--all", "--no-color", "--format=%(refname)"
        ).splitlines()
        # No candidate is a suffix of another, so each ref matches at most
        # one of them and appears at most once in the result.
        branches = [
            refname.strip()
            for refname in refnames
            if refname.strip().endswith(candidates)
        ]

        if len(branches) == 1:
            return branches[0]

        raise RuntimeError(f"Unable to find default branch. Got: {branches}")

    def get_url(self, remote="origin"):
        """URL configured for ``remote``."""
        return self.run("remote", "get-url", remote).strip()

    def get_commit_message(self, revision=None):
        """Commit message of ``revision``, or of the head revision when
        ``revision`` is not given."""
        revision = revision or self.head_rev
        # Pass the revision explicitly; previously the log of HEAD was always
        # returned and the ``revision`` argument was silently ignored.
        return self.run("log", "-n1", "--format=%B", revision)

    def get_changed_files(
        self, diff_filter="ADM", mode="unstaged", rev=None, base_rev=None
    ):
        """List changed files; see ``Repository.get_changed_files``.

        Raises:
            ValueError: if ``base_rev`` is given without ``rev``.
        """
        assert all(f.lower() in self._valid_diff_filter for f in diff_filter)

        if rev is None:
            if base_rev is not None:
                raise ValueError("Cannot specify `base_rev` without `rev`")
            cmd = ["diff"]
            if mode == "staged":
                cmd.append("--cached")
            elif mode == "all":
                cmd.append("HEAD")
        else:
            revision_argument = (
                f"{rev}~1..{rev}" if base_rev is None else f"{base_rev}..{rev}"
            )
            # An empty format leaves only the file names in the log output.
            cmd = ["log", "--format=format:", revision_argument]

        cmd.append("--name-only")
        cmd.append("--diff-filter=" + diff_filter.upper())

        files = self.run(*cmd).splitlines()
        return [f for f in files if f]

    def get_outgoing_files(self, diff_filter="ADM", upstream=None):
        """List files changed in commits reachable from ``HEAD`` but not from
        ``upstream`` (or not from any remote ref when ``upstream`` is unset)."""
        assert all(f.lower() in self._valid_diff_filter for f in diff_filter)

        not_condition = upstream if upstream else "--remotes"

        files = self.run(
            "log",
            "--name-only",
            f"--diff-filter={diff_filter.upper()}",
            "--oneline",
            "--pretty=format:",
            "HEAD",
            "--not",
            not_condition,
        ).splitlines()
        return [f for f in files if f]

    def working_directory_clean(self, untracked=False, ignored=False):
        """Return True when ``git status --porcelain`` reports no entry of
        the requested kinds."""
        args = ["status", "--porcelain"]

        # Even in --porcelain mode, behavior is affected by the
        # ``status.showUntrackedFiles`` option, which means we need to be
        # explicit about how to treat untracked files.
        if untracked:
            args.append("--untracked-files=all")
        else:
            args.append("--untracked-files=no")

        if ignored:
            args.append("--ignored")

        # If output is empty, there are no entries of requested status, which
        # means we are clean.
        return not self.run(*args).strip()

    def update(self, ref):
        """Check out ``ref``."""
        self.run("checkout", ref)

    def find_latest_common_revision(self, base_ref_or_rev, head_rev):
        """Output of ``git merge-base base_ref_or_rev head_rev``, stripped."""
        return self.run("merge-base", base_ref_or_rev, head_rev).strip()

    def does_revision_exist_locally(self, revision):
        """Return True when ``revision`` names a commit object, False when
        git exits with code 128; any other failure is re-raised."""
        try:
            return self.run("cat-file", "-t", revision).strip() == "commit"
        except subprocess.CalledProcessError as e:
            # Error code 128 comes with the message:
            # "git cat-file: could not get object info"
            if e.returncode == 128:
                return False
            raise


def get_repository(path):
    """Return a repository object for the checkout containing `path`.

    Every directory yielded by ``ancestors(path)`` is probed in order: a
    ``.hg`` directory selects Mercurial, otherwise an existing ``.git``
    entry selects Git. A RuntimeError is raised when no candidate matches.
    """
    for candidate in ancestors(path):
        hg_marker = os.path.join(candidate, ".hg")
        if os.path.isdir(hg_marker):
            return HgRepository(candidate)

        # `.git` is only required to exist; it is not required to be a
        # directory.
        git_marker = os.path.join(candidate, ".git")
        if os.path.exists(git_marker):
            return GitRepository(candidate)

    raise RuntimeError("Current directory is neither a git or hg repository")


def find_hg_revision_push_info(repository, revision):
    """Given the parameters for this action and a revision, find the
    pushlog_id of the revision.

    The pushlog URL is built from ``PUSHLOG_TMPL`` with ``repository`` and
    ``revision`` and fetched with up to 5 attempts.

    Returns:
        A dict with the keys ``pushdate``, ``pushid`` and ``user`` taken from
        the single push reported for the revision.

    Raises:
        RuntimeError: if the response does not contain exactly one push.
    """
    pushlog_url = PUSHLOG_TMPL.format(repository, revision)

    def query_pushlog(url):
        # Use the argument handed over by `retry` rather than the enclosing
        # variable, so the helper really queries the URL it is given.
        response = requests.get(url, timeout=60)
        response.raise_for_status()
        return response

    response = retry(
        query_pushlog,
        args=(pushlog_url,),
        attempts=5,
        sleeptime=10,
    )
    pushes = response.json()["pushes"]
    if len(pushes) != 1:
        raise RuntimeError(
            f"Unable to find a single pushlog_id for {repository} "
            f"revision {revision}: {pushes}"
        )

    # Exactly one entry is present at this point; take it without building
    # an intermediate list of keys.
    pushid, push = next(iter(pushes.items()))
    return {
        "pushdate": push["date"],
        "pushid": pushid,
        "user": push["user"],
    }