summaryrefslogtreecommitdiffstats
path: root/samples/mhtml-tool
diff options
context:
space:
mode:
Diffstat (limited to '')
-rwxr-xr-xsamples/mhtml-tool568
1 files changed, 568 insertions, 0 deletions
diff --git a/samples/mhtml-tool b/samples/mhtml-tool
new file mode 100755
index 0000000..aaf7f70
--- /dev/null
+++ b/samples/mhtml-tool
@@ -0,0 +1,568 @@
+#!/usr/bin/env perl
+
+use strict;
+use warnings;
+
+$| = 1;
+
+=pod
+
+=head1 NAME
+
+B<mhtml-tool> - Unpack a MIME HTML archive
+
+
+=head1 SYNOPSIS
+
+B<mhtml-tool> unpacks MIME HTML archives that some browsers (such as Opera)
+save by default. The file extensions of such archives are ".mht" or ".mhtml".
+
+The first HTML file in the archive is taken to be the primary web page, the
+other contained files for "page requisites" such as images or frames. The
+primary web page is written to the output directory (the current directory by
+default), the requisites to a subdirectory named after the primary HTML file
+name without extension, with "_files" appended. Link URLs in all HTML
+files referring to requisites are rewritten to point to the saved files.
+
+
+=head1 OPTIONS
+
+=over
+
+=item B<-e> program
+
+=item B<--exec> program
+
+Unpack the archive into a temporary directory and execute the given program
+in that directory, passing the primary HTML (first text/html entry) of the
+archive as a parameter.
+
+If B<-o> is given, use that directory location with the program.
+
+If the program is "lynx" or "(e)links(2)", this script uses the "-force-html"
+option to help when the entrypoint lacks a ".html" file-suffix.
+
+=item B<-h>
+
+=item B<-?>
+
+=item B<--help>
+
+Print a brief usage summary.
+
+=item B<-l>
+
+=item B<--list>
+
+List archive contents instead of unpacking. Four columns are output: file
+name, MIME type, size and URL. Unavailable entries are replaced by "(?)".
+
+=item B<-o> I<directory/ or name>
+
+=item B<--output> I<directory/ or name>
+
+If the argument ends in a slash or is an existing directory, unpack to that
+directory instead of current directory. Otherwise the argument is taken as a
+path to the file name to write the primary HTML file to. If the output
+directory does not exist, it is created.
+
+=back
+
+
+=head1 SEE ALSO
+
+https://github.com/ThomasDickey/mhtml-tool
+
+https://github.com/clapautius/mhtml-tool
+
+http://www.volkerschatz.com/unix/uware/unmht.html
+
+=head1 COPYLEFT
+
+B<unmht> is Copyright (c) 2012 Volker Schatz. It may be copied and/or
+modified under the same terms as Perl.
+
+B<mhtml-tool> is Copyright (c) 2018 Tudor M. Pristavu. It may be copied and/or
+modified under the same terms as Perl.
+
+B<mhtml-tool> is Copyright (c) 2024 Thomas E. Dickey. It may be copied and/or
+modified under the same terms as Perl.
+
+=cut
+
+use Cwd;
+use File::Path;
+use File::Copy;
+use File::Glob;
+use File::Temp qw/ tempdir /;
+use URI;
+use MIME::Base64;
+use MIME::QuotedPrint;
+use HTML::PullParser;
+use HTML::Tagset;
+use Getopt::Long;
+
+our $path_limit = 255;
+our %shorten_path;
+
+# RFC 7230 (obsoletes RFC 2616) indicates that URI length is limited by the
+# server line-length, which it suggests should be at least 8000 octets. The
+# problem is that components of pathnames might be limited to 255 bytes, while
+# the URI components can be much longer. As an additional complication, the
+# too-long components can be both directory- and file-names, and truncating
+# the components can produce collisions.
+sub shorten_path {
+ my $actual = $_[0];
+ if ( not defined $shorten_path{$actual} ) {
+ my @parts = split /\//, $actual;
+ for my $n ( 0 .. $#parts ) {
+ $parts[$n] = substr( $parts[$n], 0, $path_limit );
+ }
+ $shorten_path{$actual} = join "/", @parts;
+ }
+ return $shorten_path{$actual};
+}
+
+# URLs can include "&" and other characters which require quoting to use in
+# a shell command. We use system() rather than exec() to allow the temporary
+# directory to be cleaned up after running lynx.
+sub quoted {
+ my $param = $_[0];
+ $param =~ s/'/'\"'\"'/g;
+ return sprintf( "'%s'", $param );
+}
+
+# Add approriate ordinal suffix to a number.
+# -> Number
+# <- String of number with ordinal suffix
+sub ordinal {
+ return $_[0] . "th" if $_[0] > 3 && $_[0] < 20;
+ my $unitdig = $_[0] % 10;
+ return $_[0] . "st" if $unitdig == 1;
+ return $_[0] . "nd" if $unitdig == 2;
+ return $_[0] . "rd" if $unitdig == 3;
+ return $_[0] . "th";
+}
+
+{
+ my %taken;
+
+ # Find unique file name.
+ # -> Preferred file name, or undef
+ # MHT archive name (as a fallback if no name given)
+ # <- File name not conflicting with names returned by previous calls, but which
+ # may exist!
+ sub unique_name {
+ my ( $fname, $mhtname ) = @_;
+ my ( $trunc, $ext );
+
+ if ( defined $fname ) {
+ $fname =~ s/^\s+//;
+ $fname =~ s/\s+$//;
+ $taken{$fname} = 1, return $fname unless $taken{$fname};
+ ( $trunc, $ext ) = $fname =~ /^(.*?)(\.\w+)?$/;
+ $ext //= "";
+ }
+ else {
+ $trunc = $mhtname || "unpack";
+ $trunc =~ s/\.mht(?:ml?)?$//i;
+ $ext = "";
+ }
+ for my $suff ( 1 .. 9999 ) {
+ $fname = "${trunc}_$suff$ext";
+ $taken{$fname} = 1, return $fname unless $taken{$fname};
+ ++$suff;
+ }
+ return undef;
+ }
+
+}
+
+# Output error message and exit with return value 1.
+sub abort {
+ print STDERR "$_[0]\n";
+ exit 1;
+}
+
+# Generate output file directories and primary HTML file name depending on
+# --output option and primary HTML file name from archive. In case the primary
+# HTML file is not the first file in the archive, the secondary files directory
+# is renamed or the files moved on the second call, when the primary HTML file
+# name is known.
+# -> Value of --output option (or undef)
+# Primary HTML file name from MHT archive
+# Flag indicating if .._files subdirectory should be created
+# Hash reference to store resulting paths to
+sub mkfiledir {
+ my ( $outputopt, $firsthtmlname, $needfilesdir, $out ) = @_;
+
+ my $prevfilespath = $$out{filespath};
+ $firsthtmlname = "unpackmht-$$" unless defined $firsthtmlname;
+ if ( !defined $outputopt ) {
+ $$out{toppath} = ".";
+ }
+ elsif ( -d $outputopt || $outputopt =~ m!/$! ) {
+ $$out{toppath} = $outputopt;
+ }
+ else {
+ ( $$out{toppath}, $firsthtmlname ) = $outputopt =~ m!^(.*/)?([^/]+)$!;
+ abort "Empty output file name." unless defined $firsthtmlname;
+ $firsthtmlname .= ".html" unless $firsthtmlname =~ /\./;
+ $$out{toppath} = "." unless defined $$out{toppath};
+ }
+ $$out{toppath} =~ s!/$!!;
+ $$out{firstout} = "$$out{toppath}/$firsthtmlname";
+ $$out{filesdir} = $firsthtmlname;
+ $$out{filesdir} =~ s/\.[^.]+$//;
+ $$out{filesdir} = substr( $$out{filesdir}, 0, $path_limit - 6 );
+ $$out{filesdir} .= "_files";
+ $$out{filespath} = "$$out{toppath}/$$out{filesdir}";
+
+ if ( defined $prevfilespath ) {
+ return unless $prevfilespath ne $$out{filespath};
+ return unless -d $prevfilespath;
+ if ( !-d $$out{filespath} ) {
+ File::Copy::move( $prevfilespath, $$out{filespath} )
+ or abort "Could not rename secondary files directory.";
+ }
+ else {
+ for ( File::Glob::bsd_glob("$prevfilespath/*") ) {
+ File::Copy::move( $_, $$out{filespath} )
+ or abort "Could not move secondary files.";
+ }
+ }
+ }
+ else {
+ my $createall = $needfilesdir ? $$out{filespath} : $$out{toppath};
+ if ( !-d $createall ) {
+ File::Path::make_path($createall)
+ or abort "Could not create output directory $createall.";
+ }
+ }
+}
+
+my %opt;
+my @optdescr = ( 'exec|e=s', 'output|o=s', 'list|l!', 'help|h|?!', 'debug|d!' );
+my %config;
+
+my $status = GetOptions( \%opt, @optdescr );
+
+if ( !$status || $opt{help} ) {
+ print <<EOF;
+Usage: mhtml-tool [options] <MHT file>
+
+By default, mhtml-tool unpacks an MHT archive (an archive type saved by some
+browsers) to the current directory. The first HTML file in the archive is
+taken for the primary web page, and all other contained files are written to a
+directory named after that HTML file.
+
+Options:
+-e, --exec Execute the given program on a temporarily-unpacked archive
+-l, --list List archive contents (file name, MIME type, size and URL)
+-o, --output Unpack to directory <dir/> or to file <name>.html
+
+Use the command "pod2man mhtml-tool > mhtml-tool.1" or
+"pod2html mhtml-tool > mhtml-tool.html" to extract the manual.
+EOF
+ exit !$status;
+}
+
+my $origin = getcwd;
+my $tempdir = "";
+if ( $opt{exec} and not defined $opt{output} ) {
+ $tempdir = tempdir( CLEANUP => 1 );
+ $opt{output} = $tempdir . "/";
+}
+
+my $orig_sep = $/;
+my $crlf_file = 0;
+
+# Print a message to stdout (if debug is enabled).
+sub debug_msg {
+ if ( $opt{debug} ) {
+ print ":debug: $_[0]\n";
+ }
+}
+
+# Read next line. Remove line endings (both unix & windows style).
+sub read_next_line {
+ my $cur_line = <>;
+ chomp $cur_line;
+ if ( $cur_line =~ /\r$/ ) {
+ debug_msg("It's a CRLF file") if ( $crlf_file == 0 );
+ $crlf_file++;
+ }
+ $cur_line =~ s/\r//;
+ return $cur_line;
+}
+
+# Parse headers.
+# Handle multi-line headers (lines starting with spaces are part of a
+# multi-line header).
+# Read data from <> until the first empty line.
+# Set $crlf_file to 1 if it's a CRLF (windoze) file.
+sub parse_headers {
+ my %headers;
+ my $sep = $/;
+ $/ = $orig_sep;
+ my $cur_line = read_next_line;
+ my $full_line = $cur_line;
+
+ while ( not( $cur_line =~ /^$/ ) ) {
+ if ( $cur_line =~ /;$/ ) { # if a continued line...
+ while ( ( $cur_line = read_next_line ) =~ /^\h+/ ) {
+ $cur_line =~ s/^\h+//;
+ $full_line = $full_line . " " . $cur_line;
+ }
+ if ( $full_line =~ s/^([-\w]+): (.*)$// ) {
+ $headers{$1} = $2;
+ debug_msg("(1) New header $1 with value $2");
+ }
+ }
+ else {
+ if ( $full_line =~ s/^([-\w]+): (.*)$// ) {
+ $headers{$1} = $2;
+ debug_msg("(2) New header $1 with value $2");
+ }
+ }
+ $cur_line = read_next_line;
+ $full_line = $cur_line;
+ }
+ $/ = $sep;
+ return %headers;
+}
+
+my %global_headers = parse_headers();
+
+abort "Can't find Content-Type header - not a MIME HTML file?"
+ if ( !defined( $global_headers{'Content-Type'} ) );
+debug_msg("Global Content-Type: $global_headers{'Content-Type'}");
+
+my $boundary = '';
+my $endcode = '';
+
+# FIXME: - add other possible mime types
+# Types implemented so far are:
+# Content-Type: multipart/related; boundary="----=_NextPart_01D8BCC9.47C2B260"
+# Content-Type: text/html; charset="utf-8"
+if ( $global_headers{'Content-Type'} =~
+ m!multipart/related;.*\h+boundary="?([^"]*)"?$! )
+{
+ $endcode = $boundary = $1;
+ $endcode =~ s/\s+$//;
+ if ($crlf_file) {
+ $/ = "\r\n--$boundary\r\n";
+ }
+ else {
+ $/ = "\n--$boundary\n";
+ }
+ debug_msg("Boundary: $boundary");
+}
+elsif ( $global_headers{'Content-Type'} =~ m!text/html! ) {
+ $/ = '';
+}
+else {
+ die "Error: Unknown Content-Type: $global_headers{'Content-Type'}\n";
+}
+
+my %by_url;
+my @htmlfiles;
+my $fh;
+
+{
+ my $fileind = 1;
+ my $leading = "--$boundary\n";
+ while ( defined( my $data = <> ) ) {
+ chomp $data;
+ $data =~ s/\R/\n/g; # handle various other line-endings (windows, mac)
+ while ( index( $data, $leading ) == 0 ) {
+ $data = substr( $data, length($leading) );
+ }
+ my %headers;
+ while ( $data =~ s/^([-\w]+): (.*)\n// ) {
+ $headers{$1} = $2;
+ debug_msg("New header $1 with value $2");
+
+ # read (and ignore atm) lines starting with space (multi-line headers)
+ while ( $data =~ s/^\h+.*\n// ) {
+ debug_msg("empty line");
+ }
+ }
+ if ( scalar(%headers) == 0 ) {
+ %headers = %global_headers
+ ; # fallback to the global headers as needed, usually for "text"
+ }
+ $data =~ s/^\n//;
+ $data =~ s/\n--$endcode--\r?\n$/\n/s;
+ my ( $type, $origname );
+ if ( defined( $headers{"Content-Type"} ) ) {
+ ($type) = $headers{"Content-Type"} =~ /^(\w+\/\w+)\b/;
+ debug_msg("type=$type");
+ }
+ else {
+ print "Error: No Content-Type found, skipping chunk\n";
+
+ # we could print the bad chunk here...
+ # but it's probably just a "warning" about
+ # "if you see this your software isn't recognizing a web archive file"
+ next;
+ }
+ if ( defined( $headers{"Content-Type"} )
+ && $headers{"Content-Type"} =~ /\bname=([^;]*)/ )
+ {
+ $origname = $1;
+ ($type) = $headers{"Content-Type"} =~ /^(\w+\/\w+)\b/;
+ $type //= "";
+ }
+ elsif ( defined( $headers{"Content-Disposition"} )
+ && $headers{"Content-Disposition"} =~ /\bfilename=([^;]*)/ )
+ {
+ $origname = $1;
+ if ( !defined($type) ) {
+ $type = $origname =~ /\.html?$/i ? "text/html" : "";
+ }
+ }
+ elsif ( defined( $headers{"Content-Location"} ) ) {
+ $origname = $headers{"Content-Location"};
+
+ # for unknown reasons, files generated by IE11 may contain some
+ # local paths with '\' instead of '/'
+ if ( $origname =~ m!^file://.*\\! ) {
+ debug_msg("Windows-style path detected: $origname");
+ $origname =~ s!^.*\\!!;
+ }
+ $origname =~ s!^.*/!!;
+ if ( !defined($type) ) {
+ $type = "";
+ }
+ }
+
+ $origname = "noname" unless defined $origname;
+ $origname = "noname" if ( $origname eq "" );
+ my $fname = unique_name( $origname, $ARGV[0] );
+ if ( !defined( $headers{"Content-Transfer-Encoding"} ) ) {
+ print STDERR "Info: Encoding of ", ordinal($fileind),
+ " file not found - leaving as-is.\n";
+ }
+ elsif ( $headers{"Content-Transfer-Encoding"} =~ /\bbase64\b/i ) {
+ $data = MIME::Base64::decode($data);
+ }
+ elsif (
+ $headers{"Content-Transfer-Encoding"} =~ /\bquoted-printable\b/i )
+ {
+ $data = MIME::QuotedPrint::decode($data);
+ }
+ debug_msg("origname=$origname; fname=$fname; type=$type");
+ debug_msg( "Content-Type: " . $headers{"Content-Type"} );
+ debug_msg( "Data size: " . length($data) );
+ if ( $opt{list} ) {
+ $origname =~ s/\s+$// if defined $origname;
+ my $size = length($data);
+ print $fname // "(?)", "\t", $type || "(?)", "\t$size\t",
+ $headers{"Content-Location"} // "(?)", "\n";
+ next;
+ }
+ $headers{fname} = $fname;
+ if ( $headers{"Content-Location"} ) {
+ $headers{url} = $headers{"Content-Location"};
+ $headers{url} =~ s/\s+$//;
+ $by_url{ $headers{url} } = \%headers;
+ debug_msg("Content-Location: $headers{url}");
+ }
+ else {
+ debug_msg("?? no location");
+ }
+ if ( $type eq "text/html" ) {
+ $headers{data} = $data;
+ if ( scalar @htmlfiles == 0 ) { # first html file must have a name
+ if ( !$headers{fname} ) {
+ $headers{fname} = "index.html";
+ }
+ }
+ push @htmlfiles, \%headers;
+ debug_msg("New html file in list: $headers{fname}");
+ }
+ else {
+ mkfiledir( $opt{output}, $htmlfiles[0]->{fname}, 1, \%config );
+ $fname = shorten_path "$config{filespath}/$fname";
+ open $fh, ">$fname" or abort "Could not create file $fname.";
+ print $fh $data;
+ close $fh;
+ }
+ }
+ continue {
+ debug_msg("");
+ ++$fileind;
+ }
+}
+
+if ( $opt{list} ) {
+ exit(0);
+}
+
+mkfiledir( $opt{output}, $htmlfiles[0]->{fname}, 0, \%config );
+
+my $entrypoint = ".";
+my $filesprefix = $config{filesdir} . "/";
+my $outname = $config{firstout};
+print "primary html output name: $outname\n";
+
+for my $html (@htmlfiles) {
+ my $linksubst = "";
+ my $p = HTML::PullParser->new(
+ doc => \$html->{data},
+ "start" => 'text, attr, tagname',
+ "text" => 'text',
+ "end" => 'text'
+ );
+ while ( defined( my $tok = $p->get_token() ) ) {
+ my $linkary;
+ my @linkattrs;
+ if ( ref( $tok->[1] )
+ && ( $linkary = $HTML::Tagset::linkElements{ $tok->[2] } )
+ && ( @linkattrs = grep $tok->[1]->{$_}, @$linkary ) )
+ {
+ for my $attr (@linkattrs) {
+ my $uri = URI->new( $tok->[1]->{$attr} );
+ next unless defined $uri;
+ next unless defined $html->{url};
+ $uri = $uri->abs( $html->{url} );
+ $tok->[1]->{$attr} =
+ "$filesprefix" . $by_url{ $uri->as_string() }->{fname}
+ if $by_url{ $uri->as_string() };
+ }
+ delete $tok->[1]->{"/"};
+ $linksubst .=
+ "<$tok->[2] "
+ . join( " ",
+ map( "$_=\"$tok->[1]->{$_}\"", keys %{ $tok->[1] } ) )
+ . ">";
+ }
+ else {
+ $linksubst .= $tok->[0];
+ }
+ }
+ $outname = "$config{filespath}/$html->{fname}" unless defined $outname;
+ $outname = shorten_path $outname;
+ open $fh, ">$outname" or abort "Could not create file $outname.";
+ print $fh $linksubst;
+ close $fh;
+ $entrypoint = $outname if ( $entrypoint eq "." );
+
+ # for all except the first HTML file:
+ $filesprefix = "";
+ $outname = undef;
+}
+
+if ( $tempdir ne "" ) {
+ chdir $tempdir;
+ my $enforce = "";
+ $enforce = "-force-html"
+ if ( $entrypoint ne "."
+ and $entrypoint !~ /\.htm(l)?$/
+ and $opt{exec} =~ /(lynx|((e)?links(2)?))/ );
+ system( sprintf( "%s %s %s", $opt{exec}, $enforce, quoted $entrypoint ) );
+ chdir $origin;
+}
+
+1;