taskcluster/taskgraph/util/docker.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356

# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.

from __future__ import absolute_import, print_function, unicode_literals

import hashlib
import io
import json
import os
import re
import requests
import requests_unixsocket
import six
import sys

from six.moves.urllib.parse import quote, urlencode, urlunparse
from six.moves.collections_abc import Mapping

from mozbuild.util import memoize
from mozpack.files import GeneratedFile
from mozpack.archive import create_tar_gz_from_files
from .. import GECKO

from .yaml import load_yaml


# Directory holding the in-tree docker image definitions
# (``<GECKO>/taskcluster/docker``).
IMAGE_DIR = os.path.join(GECKO, "taskcluster", "docker")


def docker_url(path, **kwargs):
    """Build the ``http+unix`` URL for ``path`` on the docker daemon socket.

    The socket location comes from the ``DOCKER_SOCKET`` environment variable,
    defaulting to ``/var/run/docker.sock``. Keyword arguments become the query
    string of the returned URL.
    """
    socket_path = os.environ.get("DOCKER_SOCKET", "/var/run/docker.sock")
    # The socket path sits where the host normally goes, so every character
    # (slashes included) has to be percent-encoded.
    netloc = quote(socket_path, safe="")
    query = urlencode(kwargs)
    return urlunparse(("http+unix", netloc, path, "", query, ""))


def _iter_json_messages(chunks):
    """Yield every JSON document contained in an iterable of byte chunks.

    A chunk is not guaranteed to line up with document boundaries: it can stop
    in the middle of a document, or carry several documents back to back.
    Incomplete trailing data is kept and retried once the next chunk arrives.
    """
    decoder = json.JSONDecoder()
    pending = b""
    for chunk in chunks:
        if not chunk:
            continue
        # Cumulate with leftovers from previous iterations.
        pending += chunk
        try:
            text = pending.decode("utf-8")
        except UnicodeDecodeError:
            # The chunk boundary may have split a multi-byte character; wait
            # for more data.
            continue
        pos = 0
        end = len(text)
        while True:
            # raw_decode() does not accept leading whitespace, and documents
            # may be separated by newlines.
            while pos < end and text[pos].isspace():
                pos += 1
            if pos == end:
                break
            try:
                message, pos = decoder.raw_decode(text, pos)
            except ValueError:
                # Incomplete document: keep it for the next chunk.
                break
            yield message
        pending = text[pos:].encode("utf-8")


def post_to_docker(tar, api_path, **kwargs):
    """POSTs a tar file to a given docker API path.

    The tar argument can be anything that can be passed to requests.post()
    as data (e.g. iterator or file object).
    The extra keyword arguments are passed as arguments to the docker API.

    The JSON messages streamed back by docker are rendered on stderr.

    Raises an ``Exception`` when the HTTP status is not 200 and a
    ``NotImplementedError`` for a message of an unknown shape. When docker
    reports an ``error`` message, the process exits through ``sys.exit()``,
    using the exit code found in the error text when there is one, 1 otherwise.
    """
    # requests-unixsocket doesn't honor requests timeouts
    # See https://github.com/msabramo/requests-unixsocket/issues/44
    # We have some large docker images that trigger the default timeout,
    # so we increase the requests-unixsocket timeout here.
    session = requests.Session()
    session.mount(
        requests_unixsocket.DEFAULT_SCHEME,
        requests_unixsocket.UnixAdapter(timeout=120),
    )
    req = session.post(
        docker_url(api_path, **kwargs),
        data=tar,
        stream=True,
        headers={"Content-Type": "application/x-tar"},
    )
    if req.status_code != 200:
        # The body is expected to be a JSON object with a "message" key; fall
        # back to reporting the HTTP code when it is anything else, rather than
        # masking the failure with a decoding error.
        try:
            message = req.json().get("message")
        except (ValueError, AttributeError):
            message = None
        if not message:
            message = "docker API returned HTTP code {}".format(req.status_code)
        raise Exception(message)
    # On a terminal: maps a layer id to the index of the line it is displayed
    # on. Otherwise: maps a layer id to the last status printed for it.
    status_line = {}

    for data in _iter_json_messages(req.iter_content(chunk_size=None)):
        # data is sometimes an empty dict.
        if not data:
            continue
        # Mimick how docker itself presents the output. This code was tested
        # with API version 1.18 and 1.26.
        if "status" in data:
            if "id" in data:
                if sys.stderr.isatty():
                    total_lines = len(status_line)
                    line = status_line.setdefault(data["id"], total_lines)
                    n = total_lines - line
                    if n > 0:
                        # Move the cursor up n lines.
                        sys.stderr.write("\033[{}A".format(n))
                    # Clear line and move the cursor to the beginning of it.
                    sys.stderr.write("\033[2K\r")
                    sys.stderr.write(
                        "{}: {} {}\n".format(
                            data["id"], data["status"], data.get("progress", "")
                        )
                    )
                    if n > 1:
                        # Move the cursor down n - 1 lines, which, considering
                        # the carriage return on the last write, gets us back
                        # where we started.
                        sys.stderr.write("\033[{}B".format(n - 1))
                else:
                    status = status_line.get(data["id"])
                    # Only print status changes.
                    if status != data["status"]:
                        sys.stderr.write("{}: {}\n".format(data["id"], data["status"]))
                        status_line[data["id"]] = data["status"]
            else:
                # A status without an id starts a new group of lines.
                status_line = {}
                sys.stderr.write("{}\n".format(data["status"]))
        elif "stream" in data:
            sys.stderr.write(data["stream"])
        elif "aux" in data:
            sys.stderr.write(repr(data["aux"]))
        elif "error" in data:
            sys.stderr.write("{}\n".format(data["error"]))
            # Sadly, docker doesn't give more than a plain string for errors,
            # so the best we can do to propagate the error code from the command
            # that failed is to parse the error message...
            errcode = 1
            m = re.search(r"returned a non-zero code: (\d+)", data["error"])
            if m:
                errcode = int(m.group(1))
            sys.exit(errcode)
        else:
            raise NotImplementedError(repr(data))
        sys.stderr.flush()


def docker_image(name, by_tag=False):
    """
    Resolve in-tree prebuilt docker image to ``<registry>/<repository>@sha256:<digest>``,
    or ``<registry>/<repository>:<tag>`` if `by_tag` is `True`.

    The registry is read from the image's own ``REGISTRY`` file, falling back
    to the ``REGISTRY`` file at the top of the image directory. The digest
    comes from the image's ``HASH`` file and the tag from its ``VERSION``
    file; a missing ``VERSION`` file means ``latest``, while a missing
    ``HASH`` file raises an ``Exception``.
    """
    image_dir = os.path.join(IMAGE_DIR, name)

    try:
        with open(os.path.join(image_dir, "REGISTRY")) as registry_file:
            registry = registry_file.read().strip()
    except IOError:
        # No per-image registry: use the shared one.
        with open(os.path.join(IMAGE_DIR, "REGISTRY")) as registry_file:
            registry = registry_file.read().strip()

    if by_tag:
        try:
            with open(os.path.join(image_dir, "VERSION")) as version_file:
                tag = version_file.read().strip()
        except IOError:
            tag = "latest"
        return "{}/{}:{}".format(registry, name, tag)

    hashfile = os.path.join(image_dir, "HASH")
    try:
        with open(hashfile) as hash_file:
            digest = hash_file.read().strip()
    except IOError:
        raise Exception("Failed to read HASH file {}".format(hashfile))
    return "{}/{}@{}".format(registry, name, digest)


class VoidWriter(object):
    """Write-only file-like object that throws away everything written to
    it."""

    def write(self, buf):
        """Accept ``buf`` and discard it."""
        return None


def generate_context_hash(topsrcdir, image_path, image_name, args):
    """Return the sha256 hex digest of the context tarball for an image.

    The tarball is streamed into a writer that discards its input, so nothing
    is stored; only the digest is produced.
    """
    discard = VoidWriter()
    return stream_context_tar(topsrcdir, image_path, discard, image_name, args=args)


class HashingWriter(object):
    """File-like wrapper that forwards writes to another file object while
    feeding the same data to a SHA-256 hash."""

    def __init__(self, writer):
        self._writer = writer
        self._hash = hashlib.sha256()

    def write(self, buf):
        """Hash ``buf``, then hand it to the wrapped writer."""
        self._hash.update(buf)
        self._writer.write(buf)

    def hexdigest(self):
        """Return the hex digest of everything written so far, as text."""
        digest = self._hash.hexdigest()
        return six.ensure_text(digest)


def create_context_tar(topsrcdir, context_dir, out_path, image_name, args):
    """Write the build context of an image to a gzipped tar file.

    The directory ``context_dir``, which contains a Dockerfile, is assembled
    into a gzipped tar file written to ``out_path``.

    The source Dockerfile is also scanned for special syntax that influences
    how the context is generated:

    * A line of the form ``# %include <path>`` names a path relative to the
      source repository; the matching file is added to the context under
      ``topsrcdir/``. When the path is a directory, every file below it is
      added.

    * A line of the form ``# %ARG <name>`` makes occurrences of ``$<name>`` in
      the lines that follow be replaced with the value found in ``args``.
      Exception: this doesn't apply to VOLUME definitions.

    Returns the SHA-256 hex digest of the created archive.
    """
    with open(out_path, "wb") as archive:
        digest = stream_context_tar(
            topsrcdir, context_dir, archive, image_name=image_name, args=args
        )
    return digest


def stream_context_tar(topsrcdir, context_dir, out_file, image_name, args):
    """Like create_context_tar, but streams the tar file to the `out_file` file
    object.

    ``context_dir`` is joined onto ``topsrcdir``. Every file below it is added
    to the archive under its path relative to the context directory. The
    Dockerfile is stored with ``# %ARG`` lines removed and their substitutions
    applied, and the files named by ``# %include`` lines are added under
    ``topsrcdir/``.

    Raises an ``Exception`` when a ``# %ARG`` name is missing from ``args``, or
    when a ``# %include`` path is absolute, resolves outside of ``topsrcdir``
    or does not exist.

    Returns the SHA-256 hex digest of the data written to ``out_file``.
    """
    archive_files = {}
    replace = []
    content = []

    # Normalized form of topsrcdir for the containment check below. The
    # trailing separator keeps a sibling directory such as "<topsrcdir>-foo"
    # from being mistaken for something inside topsrcdir.
    top = os.path.normpath(topsrcdir)
    top_prefix = os.path.join(top, "")

    context_dir = os.path.join(topsrcdir, context_dir)

    for root, _dirs, files in os.walk(context_dir):
        for f in files:
            source_path = os.path.join(root, f)
            # relpath, rather than slicing by len(context_dir), stays correct
            # when context_dir ends with a path separator.
            archive_path = os.path.relpath(source_path, context_dir)
            archive_files[archive_path] = source_path

    # Parse Dockerfile for special syntax of extra files to include.
    with io.open(os.path.join(context_dir, "Dockerfile"), "r") as fh:
        for line in fh:
            if line.startswith("# %ARG"):
                p = line[len("# %ARG ") :].strip()
                if not args or p not in args:
                    raise Exception("missing argument: {}".format(p))
                # The name is matched literally.
                replace.append(
                    (re.compile(r"\${}\b".format(re.escape(p))), args[p])
                )
                continue

            for regexp, s in replace:
                # A function replacement inserts the value verbatim; a string
                # replacement would have its backslashes processed as escapes.
                line = regexp.sub(lambda _match, value=s: value, line)

            content.append(line)

            if not line.startswith("# %include"):
                continue

            p = line[len("# %include ") :].strip()
            if os.path.isabs(p):
                raise Exception("extra include path cannot be absolute: %s" % p)

            fs_path = os.path.normpath(os.path.join(topsrcdir, p))
            # Check for filesystem traversal exploits.
            if fs_path != top and not fs_path.startswith(top_prefix):
                raise Exception("extra include path outside topsrcdir: %s" % p)

            if not os.path.exists(fs_path):
                raise Exception("extra include path does not exist: %s" % p)

            if os.path.isdir(fs_path):
                for root, _dirs, files in os.walk(fs_path):
                    for f in files:
                        source_path = os.path.join(root, f)
                        rel = os.path.relpath(source_path, fs_path)
                        archive_path = os.path.join("topsrcdir", p, rel)
                        archive_files[archive_path] = source_path
            else:
                archive_path = os.path.join("topsrcdir", p)
                archive_files[archive_path] = fs_path

    # The processed Dockerfile replaces the one picked up from context_dir.
    archive_files["Dockerfile"] = GeneratedFile(
        b"".join(six.ensure_binary(s) for s in content)
    )

    writer = HashingWriter(out_file)
    create_tar_gz_from_files(writer, archive_files, "{}.tar".format(image_name))
    return writer.hexdigest()


class ImagePathsMap(Mapping):
    """Read-only mapping of Docker image names to the location of their
    context in the filesystem.

    The initial entries come from the ``jobs`` section of the YAML file at
    ``config_path``. The register function allows Thunderbird to define
    additional images under comm/taskcluster.
    """

    def __init__(self, config_path, image_dir=IMAGE_DIR):
        kind_config = load_yaml(GECKO, config_path)
        self.__update_image_paths(kind_config["jobs"], image_dir)

    # The entries live directly in the instance dictionary.
    def __getitem__(self, key):
        return vars(self)[key]

    def __iter__(self):
        return iter(vars(self))

    def __len__(self):
        return len(vars(self))

    def __update_image_paths(self, jobs, image_dir):
        """Add one entry per job: ``image_dir`` joined with the job's
        ``definition`` value, or with the job name when it has none."""
        resolved = {}
        for image_name, job in jobs.items():
            definition = job.get("definition", image_name)
            resolved[image_name] = os.path.join(image_dir, definition)
        vars(self).update(resolved)

    def register(self, jobs_config_path, image_dir):
        """Register additional image_paths. In this case, there is no 'jobs'
        key in the loaded YAML as this file is loaded via jobs-from in kind.yml."""
        extra_jobs = load_yaml(GECKO, jobs_config_path)
        self.__update_image_paths(extra_jobs, image_dir)


# Module-level mapping of image names to context directories, populated at
# import time from the "jobs" section of the docker-image kind configuration.
image_paths = ImagePathsMap("taskcluster/ci/docker-image/kind.yml")


def image_path(name):
    """Return the context directory for the image called ``name``.

    Images known to ``image_paths`` use the registered location; any other
    name is looked for directly under ``IMAGE_DIR``.
    """
    try:
        return image_paths[name]
    except KeyError:
        pass
    return os.path.join(IMAGE_DIR, name)


@memoize
def parse_volumes(image):
    """Return the set of VOLUME paths declared in an image's Dockerfile.

    Only the whitespace-separated form (``VOLUME /a /b``) is understood; the
    array form raises a ``ValueError``.
    """
    dockerfile = os.path.join(image_path(image), "Dockerfile")
    volumes = set()

    with open(dockerfile, "rb") as fh:
        for raw_line in fh:
            stripped = raw_line.strip()
            # We assume VOLUME definitions don't use %ARGS.
            if stripped.startswith(b"VOLUME "):
                spec = stripped.split(None, 1)[1]
                if spec.startswith(b"["):
                    raise ValueError(
                        "cannot parse array syntax for VOLUME; "
                        "convert to multiple entries"
                    )
                volumes.update(six.ensure_text(entry) for entry in spec.split())

    return volumes