diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-13 14:14:39 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-13 14:14:39 +0000 |
commit | ee17e45964b786b48b455959dfe68715971893fb (patch) | |
tree | 118f40aa65dc838499053413b05adfd00f839c62 /tarfilter | |
parent | Initial commit. (diff) | |
download | mmdebstrap-ee17e45964b786b48b455959dfe68715971893fb.tar.xz mmdebstrap-ee17e45964b786b48b455959dfe68715971893fb.zip |
Adding upstream version 1.4.3.upstream/1.4.3
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'tarfilter')
-rwxr-xr-x | tarfilter | 300 |
1 files changed, 300 insertions, 0 deletions
diff --git a/tarfilter b/tarfilter new file mode 100755 index 0000000..66ef229 --- /dev/null +++ b/tarfilter @@ -0,0 +1,300 @@ +#!/usr/bin/env python3 +# +# This script is in the public domain +# +# Author: Johannes Schauer Marin Rodrigues <josch@mister-muffin.de> +# +# This script accepts a tarball on standard input and filters it according to +# the same rules used by dpkg --path-exclude and --path-include, using command +# line options of the same name. The result is then printed on standard output. +# +# A tool like this should be written in C but libarchive has issues: +# https://github.com/libarchive/libarchive/issues/587 +# https://github.com/libarchive/libarchive/pull/1288/ (needs 3.4.1) +# Should these issues get fixed, then a good template is tarfilter.c in the +# examples directory of libarchive. +# +# We are not using Perl either, because Archive::Tar slurps the whole tarball +# into memory. +# +# We could also use Go but meh... +# https://stackoverflow.com/a/59542307/784669 + +import tarfile +import sys +import argparse +import fnmatch +import re + + +class PathFilterAction(argparse.Action): + def __call__(self, parser, namespace, values, option_string=None): + items = getattr(namespace, "pathfilter", []) + regex = re.compile(fnmatch.translate(values)) + items.append((self.dest, regex)) + setattr(namespace, "pathfilter", items) + + +class PaxFilterAction(argparse.Action): + def __call__(self, parser, namespace, values, option_string=None): + items = getattr(namespace, "paxfilter", []) + regex = re.compile(fnmatch.translate(values)) + items.append((self.dest, regex)) + setattr(namespace, "paxfilter", items) + + +class TypeFilterAction(argparse.Action): + def __call__(self, parser, namespace, values, option_string=None): + items = getattr(namespace, "typefilter", []) + match values: + case "REGTYPE" | "0": + items.append(tarfile.REGTYPE) + case "LNKTYPE" | "1": + items.append(tarfile.LNKTYPE) + case "SYMTYPE" | "2": + items.append(tarfile.SYMTYPE) + case "CHRTYPE" | "3": + items.append(tarfile.CHRTYPE) + case "BLKTYPE" | "4": + items.append(tarfile.BLKTYPE) + case "DIRTYPE" | "5": + items.append(tarfile.DIRTYPE) + case "FIFOTYPE" | "6": + items.append(tarfile.FIFOTYPE) + case _: + raise ValueError("invalid type: %s" % values) + setattr(namespace, "typefilter", items) + + +class TransformAction(argparse.Action): + def __call__(self, parser, namespace, values, option_string=None): + items = getattr(namespace, "trans", []) + # This function mimics what src/transform.c from tar does + if not values.startswith("s"): + raise ValueError("regex must start with an 's'") + if len(values) <= 4: + # minimum regex: s/x// + raise ValueError("invalid regex (too short)") + d = values[1] + if values.startswith(f"s{d}{d}"): + raise ValueError("empty regex") + values = values.removeprefix(f"s{d}") + flags = 0 + if values.endswith(f"{d}i"): + # trailing flags + flags = re.IGNORECASE + values = values.removesuffix(f"{d}i") + # This regex only finds non-empty tokens. + # Finding empty tokens would require a variable length look-behind + # or \K in order to find escaped delimiters which is not supported by + # the python re module. + tokens = re.findall(rf"(?:\\[\\{d}]|[^{d}])+", values) + match len(tokens): + case 0: + raise ValueError("invalid regex: not enough terms") + case 1: + repl = "" + case 2: + repl = tokens[1] + case _: + raise ValueError("invalid regex: too many terms: %s" % tokens) + items.append((re.compile(tokens[0], flags), repl)) + setattr(namespace, "trans", items) + + +def main(): + parser = argparse.ArgumentParser( + formatter_class=argparse.RawDescriptionHelpFormatter, + description="""\ +Filters a tarball on standard input by the same rules as the dpkg --path-exclude +and --path-include options and writes resulting tarball to standard output. See +dpkg(1) for information on how these two options work in detail. To reuse the +exact same semantics as used by dpkg, paths must be given as /path and not as +./path even though they might be stored as such in the tarball. + +Secondly, filter out unwanted pax extended headers using --pax-exclude and +--pax-include. This is useful in cases where a tool only accepts certain xattr +prefixes. For example tar2sqfs only supports SCHILY.xattr.user.*, +SCHILY.xattr.trusted.* and SCHILY.xattr.security.* but not +SCHILY.xattr.system.posix_acl_default.*. + +Both types of options use Unix shell-style wildcards: + + * matches everything + ? matches any single character + [seq] matches any character in seq + [!seq] matches any character not in seq + +Thirdly, filter out files matching a specific tar archive member type using +--type-exclude. Valid type names are REGTYPE (regular file), LNKTYPE +(hardlink), SYMTYPE (symlink), CHRTYPE (character special), BLKTYPE (block +special), DIRTYPE (directory), FIFOTYPE (fifo) or their tar format flag value +(0-6, respectively). + +Fourthly, transform the path of tar members using a sed expression just as with +GNU tar --transform. + +Fifthly, strip leading directory components off of tar members. Just as with +GNU tar --strip-components, tar members that have less or equal components in +their path are not passed through. + +Lastly, shift user id and group id of each entry by the value given by the +--idshift argument. The resulting uid or gid must not be negative. +""", + ) + parser.add_argument( + "--path-exclude", + metavar="pattern", + action=PathFilterAction, + help="Exclude path matching the given shell pattern. " + "This option can be specified multiple times.", + ) + parser.add_argument( + "--path-include", + metavar="pattern", + action=PathFilterAction, + help="Re-include a pattern after a previous exclusion. " + "This option can be specified multiple times.", + ) + parser.add_argument( + "--pax-exclude", + metavar="pattern", + action=PaxFilterAction, + help="Exclude pax header matching the given globbing pattern. " + "This option can be specified multiple times.", + ) + parser.add_argument( + "--pax-include", + metavar="pattern", + action=PaxFilterAction, + help="Re-include a pax header after a previous exclusion. " + "This option can be specified multiple times.", + ) + parser.add_argument( + "--type-exclude", + metavar="type", + action=TypeFilterAction, + help="Exclude certain member types by their type. Choose types either " + "by their name (REGTYPE, LNKTYPE, SYMTYPE, CHRTYPE, BLKTYPE, DIRTYPE, " + "FIFOTYPE) or by their tar format flag values (0-6, respectively). " + "This option can be specified multiple times.", + ) + parser.add_argument( + "--transform", + "--xform", + metavar="EXPRESSION", + action=TransformAction, + help="Use sed replace EXPRESSION to transform file names. " + "This option can be specified multiple times.", + ) + parser.add_argument( + "--strip-components", + metavar="NUMBER", + type=int, + help="Strip NUMBER leading components from file names", + ) + parser.add_argument( + "--idshift", + metavar="NUM", + type=int, + help="Integer value by which to shift the uid and gid of each entry", + ) + args = parser.parse_args() + if ( + not hasattr(args, "pathfilter") + and not hasattr(args, "paxfilter") + and not hasattr(args, "typefilter") + and not hasattr(args, "strip_components") + ): + from shutil import copyfileobj + + copyfileobj(sys.stdin.buffer, sys.stdout.buffer) + exit() + + # same logic as in dpkg/src/filters.c/filter_should_skip() + prefix_prog = re.compile(r"^([^*?[\\]*).*") + + def path_filter_should_skip(member): + skip = False + if not hasattr(args, "pathfilter"): + return False + for t, r in args.pathfilter: + if r.match(member.name[1:]) is not None: + if t == "path_include": + skip = False + else: + skip = True + if skip and (member.isdir() or member.issym()): + for t, r in args.pathfilter: + if t != "path_include": + continue + prefix = prefix_prog.sub(r"\1", r.pattern) + prefix = prefix.rstrip("/") + if member.name[1:].startswith(prefix): + return False + return skip + + def pax_filter_should_skip(header): + if not hasattr(args, "paxfilter"): + return False + skip = False + for t, r in args.paxfilter: + if r.match(header) is None: + continue + if t == "pax_include": + skip = False + else: + skip = True + return skip + + def type_filter_should_skip(member): + if not hasattr(args, "typefilter"): + return False + for t in args.typefilter: + if member.type == t: + return True + return False + + # starting with Python 3.8, the default format became PAX_FORMAT but we + # are still explicit here in case of future changes. + with tarfile.open(fileobj=sys.stdin.buffer, mode="r|*") as in_tar, tarfile.open( + fileobj=sys.stdout.buffer, mode="w|", format=tarfile.PAX_FORMAT + ) as out_tar: + for member in in_tar: + if path_filter_should_skip(member): + continue + if type_filter_should_skip(member): + continue + if args.strip_components: + comps = member.name.split("/") + # just as with GNU tar, archive members with less or equal + # number of components are not passed through at all + if len(comps) <= args.strip_components: + continue + member.name = "/".join(comps[args.strip_components :]) + member.pax_headers = { + k: v + for k, v in member.pax_headers.items() + if not pax_filter_should_skip(k) + } + if args.idshift: + if args.idshift < 0 and -args.idshift > member.uid: + print("uid cannot be negative", file=sys.stderr) + exit(1) + if args.idshift < 0 and -args.idshift > member.gid: + print("gid cannot be negative", file=sys.stderr) + exit(1) + member.uid += args.idshift + member.gid += args.idshift + if hasattr(args, "trans"): + for r, s in args.trans: + member.name = r.sub(s, member.name) + if member.isfile(): + with in_tar.extractfile(member) as file: + out_tar.addfile(member, file) + else: + out_tar.addfile(member) + + +if __name__ == "__main__": + main() |