summaryrefslogtreecommitdiffstats
path: root/tarfilter
blob: ad776167a8473d5d15dbe22e850f4f6db35cf278 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
#!/usr/bin/env python3
#
# This script is in the public domain
#
# Author: Johannes Schauer Marin Rodrigues <josch@mister-muffin.de>
#
# This script accepts a tarball on standard input and filters it according to
# the same rules used by dpkg --path-exclude and --path-include, using command
# line options of the same name. The result is then printed on standard output.
#
# A tool like this should be written in C but libarchive has issues:
# https://github.com/libarchive/libarchive/issues/587
# https://github.com/libarchive/libarchive/pull/1288/ (needs 3.4.1)
# Should these issues get fixed, then a good template is tarfilter.c in the
# examples directory of libarchive.
#
# We are not using Perl either, because Archive::Tar slurps the whole tarball
# into memory.
#
# We could also use Go but meh...
# https://stackoverflow.com/a/59542307/784669

import tarfile
import sys
import argparse
import fnmatch
import re


class PathFilterAction(argparse.Action):
    """Collect --path-exclude/--path-include patterns in command line order.

    Every occurrence of the option appends a ``(dest, compiled_regex)`` tuple
    to ``namespace.pathfilter``, where ``dest`` is the argparse destination
    of the option that was used and the regex is the shell pattern translated
    by fnmatch.
    """

    def __call__(self, parser, namespace, values, option_string=None):
        compiled = re.compile(fnmatch.translate(values))
        # the attribute only exists once the first pattern has been seen
        if not hasattr(namespace, "pathfilter"):
            namespace.pathfilter = []
        namespace.pathfilter.append((self.dest, compiled))


class PaxFilterAction(argparse.Action):
    """Collect --pax-exclude/--pax-include patterns in command line order.

    The shell pattern is translated by fnmatch, compiled and stored together
    with the argparse destination of the option as a ``(dest, regex)`` tuple
    in the list ``namespace.paxfilter``.
    """

    def __call__(self, parser, namespace, values, option_string=None):
        rules = getattr(namespace, "paxfilter", [])
        pattern = fnmatch.translate(values)
        rules.append((self.dest, re.compile(pattern)))
        # store the list back because it might have been freshly created
        namespace.paxfilter = rules


class TypeFilterAction(argparse.Action):
    """Collect the tar member types given via --type-exclude.

    A type is accepted either by its symbolic name or by its tar format flag
    value. The matching tarfile type constant is appended to the list
    ``namespace.typefilter``. Any other value raises ValueError.
    """

    def __call__(self, parser, namespace, values, option_string=None):
        # symbolic names in the order of their flag values 0 to 6
        names = (
            "REGTYPE",
            "LNKTYPE",
            "SYMTYPE",
            "CHRTYPE",
            "BLKTYPE",
            "DIRTYPE",
            "FIFOTYPE",
        )
        lookup = {}
        for flag, name in enumerate(names):
            constant = getattr(tarfile, name)
            lookup[name] = constant
            lookup[str(flag)] = constant
        selected = getattr(namespace, "typefilter", [])
        if values not in lookup:
            raise ValueError("invalid type: %s" % values)
        selected.append(lookup[values])
        namespace.typefilter = selected


class TransformAction(argparse.Action):
    def __call__(self, parser, namespace, values, option_string=None):
        items = getattr(namespace, "trans", [])
        # This function mimics what src/transform.c from tar does
        if not values.startswith("s"):
            raise ValueError("regex must start with an 's'")
        if len(values) <= 4:
            # minimum regex: s/x//
            raise ValueError("invalid regex (too short)")
        d = values[1]
        if values.startswith(f"s{d}{d}"):
            raise ValueError("empty regex")
        values = values.removeprefix(f"s{d}")
        flags = 0
        if values.endswith(f"{d}i"):
            # trailing flags
            flags = re.IGNORECASE
            values = values.removesuffix(f"{d}i")
        # This regex only finds non-empty tokens.
        # Finding empty tokens would require a variable length look-behind
        # or \K in order to find escaped delimiters which is not supported by
        # the python re module.
        tokens = re.findall(rf"(?:\\[\\{d}]|[^{d}])+", values)
        match len(tokens):
            case 0:
                raise ValueError("invalid regex: not enough terms")
            case 1:
                repl = ""
            case 2:
                repl = tokens[1]
            case _:
                raise ValueError("invalid regex: too many terms: %s" % tokens)
        items.append((re.compile(tokens[0], flags), repl))
        setattr(namespace, "trans", items)


def main():
    """Filter the tarball on standard input and write it to standard output.

    The command line is parsed with argparse. If no option requests any
    change, standard input is copied to standard output unmodified.
    Otherwise the input is read as a (possibly compressed) tar stream and
    each member is, in this order, dropped by path or type filters, stripped
    of leading path components, stripped of unwanted pax headers, shifted in
    uid and gid and renamed by the --transform expressions before it is
    written out as an uncompressed pax tarball.

    Exits with status 1 if --idshift would make a uid or gid negative and
    with an argparse usage error if --strip-components is negative.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description="""\
Filters a tarball on standard input by the same rules as the dpkg --path-exclude
and --path-include options and writes resulting tarball to standard output. See
dpkg(1) for information on how these two options work in detail. To reuse the
exact same semantics as used by dpkg, paths must be given as /path and not as
./path even though they might be stored as such in the tarball.

Secondly, filter out unwanted pax extended headers using --pax-exclude and
--pax-include. This is useful in cases where a tool only accepts certain xattr
prefixes. For example tar2sqfs only supports SCHILY.xattr.user.*,
SCHILY.xattr.trusted.* and SCHILY.xattr.security.* but not
SCHILY.xattr.system.posix_acl_default.*.

Both types of options use Unix shell-style wildcards:

       * matches everything
       ? matches any single character
   [seq] matches any character in seq
  [!seq] matches any character not in seq

Thirdly, filter out files matching a specific tar archive member type using
--type-exclude. Valid type names are REGTYPE (regular file), LNKTYPE
(hardlink), SYMTYPE (symlink), CHRTYPE (character special), BLKTYPE (block
special), DIRTYPE (directory), FIFOTYPE (fifo) or their tar format flag value
(0-6, respectively).

Fourthly, transform the path of tar members using a sed expression just as with
GNU tar --transform.

Fifthly, strip leading directory components off of tar members. Just as with
GNU tar --strip-components, tar members that have less or equal components in
their path are not passed through.

Lastly, shift user id and group id of each entry by the value given by the
--idshift argument. The resulting uid or gid must not be negative.
""",
    )

    class PathIncludeAction(PathFilterAction):
        """PathFilterAction which also remembers the unmodified glob.

        The re-inclusion of directories needs the literal prefix of the
        include globs. That prefix cannot be recovered from the compiled
        regex because fnmatch.translate() wraps and escapes the pattern.
        """

        def __call__(self, parser, namespace, values, option_string=None):
            super().__call__(parser, namespace, values, option_string)
            globs = getattr(namespace, "path_include_globs", [])
            globs.append(values)
            setattr(namespace, "path_include_globs", globs)

    parser.add_argument(
        "--path-exclude",
        metavar="pattern",
        action=PathFilterAction,
        help="Exclude path matching the given shell pattern. "
        "This option can be specified multiple times.",
    )
    parser.add_argument(
        "--path-include",
        metavar="pattern",
        action=PathIncludeAction,
        help="Re-include a pattern after a previous exclusion. "
        "This option can be specified multiple times.",
    )
    parser.add_argument(
        "--pax-exclude",
        metavar="pattern",
        action=PaxFilterAction,
        help="Exclude pax header matching the given globbing pattern. "
        "This option can be specified multiple times.",
    )
    parser.add_argument(
        "--pax-include",
        metavar="pattern",
        action=PaxFilterAction,
        help="Re-include a pax header after a previous exclusion. "
        "This option can be specified multiple times.",
    )
    parser.add_argument(
        "--type-exclude",
        metavar="type",
        action=TypeFilterAction,
        help="Exclude certain member types by their type. Choose types either "
        "by their name (REGTYPE, LNKTYPE, SYMTYPE, CHRTYPE, BLKTYPE, DIRTYPE, "
        "FIFOTYPE) or by their tar format flag values (0-6, respectively). "
        "This option can be specified multiple times.",
    )
    parser.add_argument(
        "--transform",
        "--xform",
        metavar="EXPRESSION",
        action=TransformAction,
        help="Use sed replace EXPRESSION to transform file names. "
        "This option can be specified multiple times.",
    )
    parser.add_argument(
        "--strip-components",
        metavar="NUMBER",
        type=int,
        help="Strip NUMBER leading components from file names",
    )
    parser.add_argument(
        "--idshift",
        metavar="NUM",
        type=int,
        help="Integer value by which to shift the uid and gid of each entry",
    )
    args = parser.parse_args()
    if args.strip_components is not None and args.strip_components < 0:
        parser.error("--strip-components must not be negative")
    # The custom actions only create their attribute when the option was
    # given but argparse always sets strip_components and idshift (to None
    # by default), so those two have to be checked by value. A value of
    # zero does not change anything either.
    if (
        not hasattr(args, "pathfilter")
        and not hasattr(args, "paxfilter")
        and not hasattr(args, "typefilter")
        and not hasattr(args, "trans")
        and not args.strip_components
        and not args.idshift
    ):
        from shutil import copyfileobj

        # nothing to do, so pass the input through without parsing it
        copyfileobj(sys.stdin.buffer, sys.stdout.buffer)
        return

    # same logic as in dpkg/src/filters.c/filter_should_skip()
    #
    # literal part of each include glob up to its first wildcard character
    # and without trailing slashes
    wildcard_prog = re.compile(r"[*?[\\]")
    include_prefixes = [
        wildcard_prog.split(glob, maxsplit=1)[0].rstrip("/")
        for glob in getattr(args, "path_include_globs", [])
    ]
    # Matches leading slashes and leading "." or ".." path components. Only
    # whole components are matched so that the leading dot of a hidden file
    # like ./.bashrc is kept.
    leading_prog = re.compile(r"(?:\.{0,2}/)*(?:\.{1,2}\Z)?")

    def path_filter_should_skip(member):
        """Return True if the path filters exclude the given member."""
        if not hasattr(args, "pathfilter"):
            return False
        # normalize path and make it absolute by stripping off all leading
        # dot components and slashes and then prepending a slash
        name = "/" + member.name[leading_prog.match(member.name).end() :]
        skip = False
        # the last matching pattern decides
        for t, r in args.pathfilter:
            if r.match(name) is not None:
                skip = t != "path_include"
        # Keep excluded directories (or symlinks to them) if an include
        # pattern might bring back files below them.
        if skip and (member.isdir() or member.issym()):
            for prefix in include_prefixes:
                if name.startswith(prefix):
                    return False
        return skip

    def pax_filter_should_skip(header):
        """Return True if the pax filters exclude the given header name."""
        if not hasattr(args, "paxfilter"):
            return False
        skip = False
        # the last matching pattern decides
        for t, r in args.paxfilter:
            if r.match(header) is None:
                continue
            skip = t != "pax_include"
        return skip

    def type_filter_should_skip(member):
        """Return True if the type of the given member is excluded."""
        if not hasattr(args, "typefilter"):
            return False
        return member.type in args.typefilter

    # starting with Python 3.8, the default format became PAX_FORMAT but we
    # are still explicit here in case of future changes.
    with tarfile.open(fileobj=sys.stdin.buffer, mode="r|*") as in_tar, tarfile.open(
        fileobj=sys.stdout.buffer, mode="w|", format=tarfile.PAX_FORMAT
    ) as out_tar:
        for member in in_tar:
            if path_filter_should_skip(member):
                continue
            if type_filter_should_skip(member):
                continue
            original_name = member.name
            if args.strip_components:
                comps = member.name.split("/")
                # just as with GNU tar, archive members with less or equal
                # number of components are not passed through at all
                if len(comps) <= args.strip_components:
                    continue
                member.name = "/".join(comps[args.strip_components :])
            member.pax_headers = {
                k: v
                for k, v in member.pax_headers.items()
                if not pax_filter_should_skip(k)
            }
            if args.idshift:
                if args.idshift < 0 and -args.idshift > member.uid:
                    print("uid cannot be negative", file=sys.stderr)
                    sys.exit(1)
                if args.idshift < 0 and -args.idshift > member.gid:
                    print("gid cannot be negative", file=sys.stderr)
                    sys.exit(1)
                member.uid += args.idshift
                member.gid += args.idshift
                # When writing, tarfile gives existing pax records priority
                # over the attributes of the member. Drop the records from
                # the input so that the shifted ids are not overridden.
                # tarfile adds them again if the new values require it.
                member.pax_headers.pop("uid", None)
                member.pax_headers.pop("gid", None)
            for r, s in getattr(args, "trans", []):
                member.name = r.sub(s, member.name)
            if member.name != original_name:
                # same as for uid and gid: a path record from the input
                # would override the new name
                member.pax_headers.pop("path", None)
            if member.isfile():
                with in_tar.extractfile(member) as file:
                    out_tar.addfile(member, file)
            else:
                out_tar.addfile(member)


# Only run the filter when this file is executed as a script and not when it
# is imported as a module.
if __name__ == "__main__":
    main()