summaryrefslogtreecommitdiffstats
path: root/python/mozbuild/mozpack/archive.py
blob: 89bf14b1795cd5429e4222b3758f33dc8463158e (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.

import bz2
import gzip
import stat
import tarfile

from .files import BaseFile, File

# 2016-01-01T00:00:00+0000
DEFAULT_MTIME = 1451606400


# Python 3.9 contains this change:
#  https://github.com/python/cpython/commit/674935b8caf33e47c78f1b8e197b1b77a04992d2
# which changes the output of tar creation compared to earlier versions.
# As this code is used to generate tar files that are meant to be deterministic
# across versions of python (specifically, it's used as part of computing the hash
# of docker images, which needs to be identical between CI (which uses python 3.8),
# and developer environments (using arbitrary versions of python, at this point,
# most probably more recent than 3.9)).
# What we do is subblass TarInfo so that if used on python >= 3.9, it reproduces the
# behavior from python < 3.9.
# Here's how it goes:
# - the behavior in python >= 3.9 is the same as python < 3.9 when the type encoded
# in the tarinfo is CHRTYPE or BLKTYPE.
# - the value of the type is only compared in the context of choosing which behavior
# to take
# - we replace the type with the same value (so that using the value has no changes)
# but that pretends to be the same as CHRTYPE so that the condition that enables the
# old behavior is taken.
class HackedType(bytes):
    def __eq__(self, other):
        if other == tarfile.CHRTYPE:
            return True
        return self == other


class TarInfo(tarfile.TarInfo):
    @staticmethod
    def _create_header(info, format, encoding, errors):
        info["type"] = HackedType(info["type"])
        return tarfile.TarInfo._create_header(info, format, encoding, errors)


def create_tar_from_files(fp, files):
    """Create a tar file deterministically.

    Receives a dict mapping names of files in the archive to local filesystem
    paths or ``mozpack.files.BaseFile`` instances.

    The files will be archived and written to the passed file handle opened
    for writing.

    Only regular files can be written.

    FUTURE accept a filename argument (or create APIs to write files)
    """
    # The format is explicitly set to tarfile.GNU_FORMAT, because this default format
    # has been changed in Python 3.8.
    with tarfile.open(
        name="", mode="w", fileobj=fp, dereference=True, format=tarfile.GNU_FORMAT
    ) as tf:
        for archive_path, f in sorted(files.items()):
            if not isinstance(f, BaseFile):
                f = File(f)

            ti = TarInfo(archive_path)
            ti.mode = f.mode or 0o0644
            ti.type = tarfile.REGTYPE

            if not ti.isreg():
                raise ValueError("not a regular file: %s" % f)

            # Disallow setuid and setgid bits. This is an arbitrary restriction.
            # However, since we set uid/gid to root:root, setuid and setgid
            # would be a glaring security hole if the archive were
            # uncompressed as root.
            if ti.mode & (stat.S_ISUID | stat.S_ISGID):
                raise ValueError("cannot add file with setuid or setgid set: " "%s" % f)

            # Set uid, gid, username, and group as deterministic values.
            ti.uid = 0
            ti.gid = 0
            ti.uname = ""
            ti.gname = ""

            # Set mtime to a constant value.
            ti.mtime = DEFAULT_MTIME

            ti.size = f.size()
            # tarfile wants to pass a size argument to read(). So just
            # wrap/buffer in a proper file object interface.
            tf.addfile(ti, f.open())


def create_tar_gz_from_files(fp, files, filename=None, compresslevel=9):
    """Create a tar.gz file deterministically from files.

    This is a glorified wrapper around ``create_tar_from_files`` that
    adds gzip compression.

    The passed file handle should be opened for writing in binary mode.
    When the function returns, all data has been written to the handle.
    """
    # Offset 3-7 in the gzip header contains an mtime. Pin it to a known
    # value so output is deterministic.
    gf = gzip.GzipFile(
        filename=filename or "",
        mode="wb",
        fileobj=fp,
        compresslevel=compresslevel,
        mtime=DEFAULT_MTIME,
    )
    with gf:
        create_tar_from_files(gf, files)


class _BZ2Proxy(object):
    """File object that proxies writes to a bz2 compressor."""

    def __init__(self, fp, compresslevel=9):
        self.fp = fp
        self.compressor = bz2.BZ2Compressor(compresslevel)
        self.pos = 0

    def tell(self):
        return self.pos

    def write(self, data):
        data = self.compressor.compress(data)
        self.pos += len(data)
        self.fp.write(data)

    def close(self):
        data = self.compressor.flush()
        self.pos += len(data)
        self.fp.write(data)


def create_tar_bz2_from_files(fp, files, compresslevel=9):
    """Create a tar.bz2 file deterministically from files.

    This is a glorified wrapper around ``create_tar_from_files`` that
    adds bzip2 compression.

    This function is similar to ``create_tar_gzip_from_files()``.
    """
    proxy = _BZ2Proxy(fp, compresslevel=compresslevel)
    create_tar_from_files(proxy, files)
    proxy.close()