summaryrefslogtreecommitdiffstats
path: root/tarfilter
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-13 14:14:39 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-13 14:14:39 +0000
commitee17e45964b786b48b455959dfe68715971893fb (patch)
tree118f40aa65dc838499053413b05adfd00f839c62 /tarfilter
parentInitial commit. (diff)
downloadmmdebstrap-ee17e45964b786b48b455959dfe68715971893fb.tar.xz
mmdebstrap-ee17e45964b786b48b455959dfe68715971893fb.zip
Adding upstream version 1.4.3.upstream/1.4.3
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'tarfilter')
-rwxr-xr-xtarfilter300
1 files changed, 300 insertions, 0 deletions
diff --git a/tarfilter b/tarfilter
new file mode 100755
index 0000000..66ef229
--- /dev/null
+++ b/tarfilter
@@ -0,0 +1,300 @@
+#!/usr/bin/env python3
+#
+# This script is in the public domain
+#
+# Author: Johannes Schauer Marin Rodrigues <josch@mister-muffin.de>
+#
+# This script accepts a tarball on standard input and filters it according to
+# the same rules used by dpkg --path-exclude and --path-include, using command
+# line options of the same name. The result is then printed on standard output.
+#
+# A tool like this should be written in C but libarchive has issues:
+# https://github.com/libarchive/libarchive/issues/587
+# https://github.com/libarchive/libarchive/pull/1288/ (needs 3.4.1)
+# Should these issues get fixed, then a good template is tarfilter.c in the
+# examples directory of libarchive.
+#
+# We are not using Perl either, because Archive::Tar slurps the whole tarball
+# into memory.
+#
+# We could also use Go but meh...
+# https://stackoverflow.com/a/59542307/784669
+
+import tarfile
+import sys
+import argparse
+import fnmatch
+import re
+
+
+class PathFilterAction(argparse.Action):
+ def __call__(self, parser, namespace, values, option_string=None):
+ items = getattr(namespace, "pathfilter", [])
+ regex = re.compile(fnmatch.translate(values))
+ items.append((self.dest, regex))
+ setattr(namespace, "pathfilter", items)
+
+
+class PaxFilterAction(argparse.Action):
+ def __call__(self, parser, namespace, values, option_string=None):
+ items = getattr(namespace, "paxfilter", [])
+ regex = re.compile(fnmatch.translate(values))
+ items.append((self.dest, regex))
+ setattr(namespace, "paxfilter", items)
+
+
+class TypeFilterAction(argparse.Action):
+ def __call__(self, parser, namespace, values, option_string=None):
+ items = getattr(namespace, "typefilter", [])
+ match values:
+ case "REGTYPE" | "0":
+ items.append(tarfile.REGTYPE)
+ case "LNKTYPE" | "1":
+ items.append(tarfile.LNKTYPE)
+ case "SYMTYPE" | "2":
+ items.append(tarfile.SYMTYPE)
+ case "CHRTYPE" | "3":
+ items.append(tarfile.CHRTYPE)
+ case "BLKTYPE" | "4":
+ items.append(tarfile.BLKTYPE)
+ case "DIRTYPE" | "5":
+ items.append(tarfile.DIRTYPE)
+ case "FIFOTYPE" | "6":
+ items.append(tarfile.FIFOTYPE)
+ case _:
+ raise ValueError("invalid type: %s" % values)
+ setattr(namespace, "typefilter", items)
+
+
+class TransformAction(argparse.Action):
+ def __call__(self, parser, namespace, values, option_string=None):
+ items = getattr(namespace, "trans", [])
+ # This function mimics what src/transform.c from tar does
+ if not values.startswith("s"):
+ raise ValueError("regex must start with an 's'")
+ if len(values) <= 4:
+ # minimum regex: s/x//
+ raise ValueError("invalid regex (too short)")
+ d = values[1]
+ if values.startswith(f"s{d}{d}"):
+ raise ValueError("empty regex")
+ values = values.removeprefix(f"s{d}")
+ flags = 0
+ if values.endswith(f"{d}i"):
+ # trailing flags
+ flags = re.IGNORECASE
+ values = values.removesuffix(f"{d}i")
+ # This regex only finds non-empty tokens.
+ # Finding empty tokens would require a variable length look-behind
+ # or \K in order to find escaped delimiters which is not supported by
+ # the python re module.
+ tokens = re.findall(rf"(?:\\[\\{d}]|[^{d}])+", values)
+ match len(tokens):
+ case 0:
+ raise ValueError("invalid regex: not enough terms")
+ case 1:
+ repl = ""
+ case 2:
+ repl = tokens[1]
+ case _:
+ raise ValueError("invalid regex: too many terms: %s" % tokens)
+ items.append((re.compile(tokens[0], flags), repl))
+ setattr(namespace, "trans", items)
+
+
+def main():
+ parser = argparse.ArgumentParser(
+ formatter_class=argparse.RawDescriptionHelpFormatter,
+ description="""\
+Filters a tarball on standard input by the same rules as the dpkg --path-exclude
+and --path-include options and writes resulting tarball to standard output. See
+dpkg(1) for information on how these two options work in detail. To reuse the
+exact same semantics as used by dpkg, paths must be given as /path and not as
+./path even though they might be stored as such in the tarball.
+
+Secondly, filter out unwanted pax extended headers using --pax-exclude and
+--pax-include. This is useful in cases where a tool only accepts certain xattr
+prefixes. For example tar2sqfs only supports SCHILY.xattr.user.*,
+SCHILY.xattr.trusted.* and SCHILY.xattr.security.* but not
+SCHILY.xattr.system.posix_acl_default.*.
+
+Both types of options use Unix shell-style wildcards:
+
+ * matches everything
+ ? matches any single character
+ [seq] matches any character in seq
+ [!seq] matches any character not in seq
+
+Thirdly, filter out files matching a specific tar archive member type using
+--type-exclude. Valid type names are REGTYPE (regular file), LNKTYPE
+(hardlink), SYMTYPE (symlink), CHRTYPE (character special), BLKTYPE (block
+special), DIRTYPE (directory), FIFOTYPE (fifo) or their tar format flag value
+(0-6, respectively).
+
+Fourthly, transform the path of tar members using a sed expression just as with
+GNU tar --transform.
+
+Fifthly, strip leading directory components off of tar members. Just as with
+GNU tar --strip-components, tar members that have less or equal components in
+their path are not passed through.
+
+Lastly, shift user id and group id of each entry by the value given by the
+--idshift argument. The resulting uid or gid must not be negative.
+""",
+ )
+ parser.add_argument(
+ "--path-exclude",
+ metavar="pattern",
+ action=PathFilterAction,
+ help="Exclude path matching the given shell pattern. "
+ "This option can be specified multiple times.",
+ )
+ parser.add_argument(
+ "--path-include",
+ metavar="pattern",
+ action=PathFilterAction,
+ help="Re-include a pattern after a previous exclusion. "
+ "This option can be specified multiple times.",
+ )
+ parser.add_argument(
+ "--pax-exclude",
+ metavar="pattern",
+ action=PaxFilterAction,
+ help="Exclude pax header matching the given globbing pattern. "
+ "This option can be specified multiple times.",
+ )
+ parser.add_argument(
+ "--pax-include",
+ metavar="pattern",
+ action=PaxFilterAction,
+ help="Re-include a pax header after a previous exclusion. "
+ "This option can be specified multiple times.",
+ )
+ parser.add_argument(
+ "--type-exclude",
+ metavar="type",
+ action=TypeFilterAction,
+ help="Exclude certain member types by their type. Choose types either "
+ "by their name (REGTYPE, LNKTYPE, SYMTYPE, CHRTYPE, BLKTYPE, DIRTYPE, "
+ "FIFOTYPE) or by their tar format flag values (0-6, respectively). "
+ "This option can be specified multiple times.",
+ )
+ parser.add_argument(
+ "--transform",
+ "--xform",
+ metavar="EXPRESSION",
+ action=TransformAction,
+ help="Use sed replace EXPRESSION to transform file names. "
+ "This option can be specified multiple times.",
+ )
+ parser.add_argument(
+ "--strip-components",
+ metavar="NUMBER",
+ type=int,
+ help="Strip NUMBER leading components from file names",
+ )
+ parser.add_argument(
+ "--idshift",
+ metavar="NUM",
+ type=int,
+ help="Integer value by which to shift the uid and gid of each entry",
+ )
+ args = parser.parse_args()
+ if (
+ not hasattr(args, "pathfilter")
+ and not hasattr(args, "paxfilter")
+ and not hasattr(args, "typefilter")
+ and not hasattr(args, "strip_components")
+ ):
+ from shutil import copyfileobj
+
+ copyfileobj(sys.stdin.buffer, sys.stdout.buffer)
+ exit()
+
+ # same logic as in dpkg/src/filters.c/filter_should_skip()
+ prefix_prog = re.compile(r"^([^*?[\\]*).*")
+
+ def path_filter_should_skip(member):
+ skip = False
+ if not hasattr(args, "pathfilter"):
+ return False
+ for t, r in args.pathfilter:
+ if r.match(member.name[1:]) is not None:
+ if t == "path_include":
+ skip = False
+ else:
+ skip = True
+ if skip and (member.isdir() or member.issym()):
+ for t, r in args.pathfilter:
+ if t != "path_include":
+ continue
+ prefix = prefix_prog.sub(r"\1", r.pattern)
+ prefix = prefix.rstrip("/")
+ if member.name[1:].startswith(prefix):
+ return False
+ return skip
+
+ def pax_filter_should_skip(header):
+ if not hasattr(args, "paxfilter"):
+ return False
+ skip = False
+ for t, r in args.paxfilter:
+ if r.match(header) is None:
+ continue
+ if t == "pax_include":
+ skip = False
+ else:
+ skip = True
+ return skip
+
+ def type_filter_should_skip(member):
+ if not hasattr(args, "typefilter"):
+ return False
+ for t in args.typefilter:
+ if member.type == t:
+ return True
+ return False
+
+ # starting with Python 3.8, the default format became PAX_FORMAT but we
+ # are still explicit here in case of future changes.
+ with tarfile.open(fileobj=sys.stdin.buffer, mode="r|*") as in_tar, tarfile.open(
+ fileobj=sys.stdout.buffer, mode="w|", format=tarfile.PAX_FORMAT
+ ) as out_tar:
+ for member in in_tar:
+ if path_filter_should_skip(member):
+ continue
+ if type_filter_should_skip(member):
+ continue
+ if args.strip_components:
+ comps = member.name.split("/")
+ # just as with GNU tar, archive members with less or equal
+ # number of components are not passed through at all
+ if len(comps) <= args.strip_components:
+ continue
+ member.name = "/".join(comps[args.strip_components :])
+ member.pax_headers = {
+ k: v
+ for k, v in member.pax_headers.items()
+ if not pax_filter_should_skip(k)
+ }
+ if args.idshift:
+ if args.idshift < 0 and -args.idshift > member.uid:
+ print("uid cannot be negative", file=sys.stderr)
+ exit(1)
+ if args.idshift < 0 and -args.idshift > member.gid:
+ print("gid cannot be negative", file=sys.stderr)
+ exit(1)
+ member.uid += args.idshift
+ member.gid += args.idshift
+ if hasattr(args, "trans"):
+ for r, s in args.trans:
+ member.name = r.sub(s, member.name)
+ if member.isfile():
+ with in_tar.extractfile(member) as file:
+ out_tar.addfile(member, file)
+ else:
+ out_tar.addfile(member)
+
+
+if __name__ == "__main__":
+ main()