summaryrefslogtreecommitdiffstats
path: root/third_party/python/taskcluster_taskgraph/taskgraph/util/archive.py
blob: 261a031038ab7e36196576fb2d87719a6fb96c5f (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.


import gzip
import os
import stat
import tarfile

# 2016-01-01T00:00:00+0000
DEFAULT_MTIME = 1451606400


# Python 3.9 contains this change:
#  https://github.com/python/cpython/commit/674935b8caf33e47c78f1b8e197b1b77a04992d2
# which changes the output of tar creation compared to earlier versions.
# This code is used to generate tar files that are meant to be deterministic
# across versions of python (specifically, it's used as part of computing the hash
# of docker images, which needs to be identical between CI (which uses python 3.8)
# and developer environments (using arbitrary versions of python, at this point,
# most probably more recent than 3.9)).
# What we do is subclass TarInfo so that if used on python >= 3.9, it reproduces the
# behavior from python < 3.9.
# Here's how it goes:
# - the behavior in python >= 3.9 is the same as python < 3.9 when the type encoded
# in the tarinfo is CHRTYPE or BLKTYPE.
# - the value of the type is only compared in the context of choosing which behavior
# to take
# - we replace the type with the same value (so that using the value has no changes)
# but that pretends to be the same as CHRTYPE so that the condition that enables the
# old behavior is taken.
class HackedType(bytes):
    """A tar type flag that additionally compares equal to ``tarfile.CHRTYPE``.

    The underlying byte value is untouched, so writing an instance into a
    header emits exactly the same bytes as the plain flag would. Only equality
    tests are affected: a comparison against ``tarfile.CHRTYPE`` always
    succeeds, and every other comparison behaves like a normal ``bytes``
    comparison.

    Because ``__eq__`` is overridden without ``__hash__``, instances are
    unhashable.
    """

    def __eq__(self, other):
        # Pretend to be CHRTYPE regardless of the real value.
        if other == tarfile.CHRTYPE:
            return True
        # Delegate explicitly to the bytes implementation: writing
        # ``self == other`` here would re-enter this method and recurse until
        # RecursionError for any value that is not CHRTYPE.
        return bytes.__eq__(self, other)


class TarInfo(tarfile.TarInfo):
    """``tarfile.TarInfo`` variant that wraps the type flag before header creation."""

    @staticmethod
    def _create_header(info, format, encoding, errors):
        """Build the header block after wrapping ``info["type"]`` in ``HackedType``.

        ``info`` is modified in place: its ``"type"`` entry is replaced by a
        ``HackedType`` holding the same bytes. Everything else is delegated to
        the stock ``tarfile.TarInfo._create_header``.
        """
        disguised_type = HackedType(info["type"])
        info["type"] = disguised_type
        # Type checking is silenced here because pyright appears to complain
        # about reaching for a non-public method.
        build_header = tarfile.TarInfo._create_header  # type: ignore
        return build_header(info, format, encoding, errors)


def create_tar_from_files(fp, files):
    """Create a tar file deterministically.

    Receives a dict mapping names of files in the archive to either local
    filesystem paths (``str``) or already-open binary file objects that
    provide ``read()`` and ``seek()``.

    The files will be archived, in sorted order of their archive names, and
    written to the passed file handle opened for writing.

    Entries given as paths take their mode and size from ``os.stat``; the file
    is opened for the duration of the write and closed again afterwards.
    Entries given as file objects get mode ``0o644``, are read once to
    determine their size, rewound to offset 0, and are left open for the
    caller to close.

    Every entry is recorded with uid/gid 0, empty user and group names, and
    ``DEFAULT_MTIME`` as its modification time.

    Only regular files can be written.

    Raises ``ValueError`` if an entry's mode has the setuid or setgid bit set.

    FUTURE accept a filename argument (or create APIs to write files)
    """
    # The format is explicitly set to tarfile.GNU_FORMAT, because this default format
    # has been changed in Python 3.8.
    with tarfile.open(
        name="", mode="w", fileobj=fp, dereference=True, format=tarfile.GNU_FORMAT
    ) as tf:
        for archive_path, source in sorted(files.items()):
            # Remember whether we opened the handle ourselves, so that only
            # those handles get closed below.
            opened_here = isinstance(source, str)
            if opened_here:
                s = os.stat(source)
                mode = s.st_mode
                size = s.st_size
                f = open(source, "rb")
            else:
                mode = 0o0644
                size = len(source.read())
                source.seek(0)
                f = source

            try:
                ti = TarInfo(archive_path)
                ti.mode = mode
                ti.type = tarfile.REGTYPE

                # NOTE(review): the type was set to REGTYPE just above, so this
                # guard cannot currently fire. It is kept as a safety net.
                if not ti.isreg():
                    raise ValueError("not a regular file: %s" % source)

                # Disallow setuid and setgid bits. This is an arbitrary restriction.
                # However, since we set uid/gid to root:root, setuid and setgid
                # would be a glaring security hole if the archive were
                # uncompressed as root.
                if ti.mode & (stat.S_ISUID | stat.S_ISGID):
                    raise ValueError(
                        "cannot add file with setuid or setgid set: %s" % source
                    )

                # Set uid, gid, username, and group as deterministic values.
                ti.uid = 0
                ti.gid = 0
                ti.uname = ""
                ti.gname = ""

                # Set mtime to a constant value.
                ti.mtime = DEFAULT_MTIME

                ti.size = size
                # tarfile reads the payload from the file object itself, using
                # the size recorded on the TarInfo.
                tf.addfile(ti, f)
            finally:
                # Release handles opened from paths, even when validation or
                # the write fails. Caller-supplied objects stay open.
                if opened_here:
                    f.close()


def create_tar_gz_from_files(fp, files, filename=None, compresslevel=9):
    """Create a gzip-compressed tar file deterministically from files.

    Thin wrapper that layers a gzip stream over ``fp`` and hands that stream
    to ``create_tar_from_files`` together with ``files``.

    ``fp`` should be a file handle opened for writing in binary mode. By the
    time this function returns, the gzip stream has been closed, so all data
    has been written to the handle.

    ``filename`` is the name stored in the gzip header (empty when not given)
    and ``compresslevel`` is passed straight to ``gzip.GzipFile``.
    """
    # The gzip header embeds a modification time. Pin it to a known value so
    # the output is deterministic.
    header_name = filename if filename else ""
    with gzip.GzipFile(
        filename=header_name,
        mode="wb",
        fileobj=fp,
        compresslevel=compresslevel,
        mtime=DEFAULT_MTIME,
    ) as gz_stream:
        create_tar_from_files(gz_stream, files)