1 files changed, 568 insertions, 0 deletions
diff --git a/samples/mhtml-tool b/samples/mhtml-tool
new file mode 100755
index 0000000..aaf7f70
--- /dev/null
+++ b/samples/mhtml-tool
@@ -0,0 +1,568 @@
+#!/usr/bin/env perl
+
+use strict;
+use warnings;
+
+$| = 1;
+
+=pod
+
+=head1 NAME
+
+B<mhtml-tool> - Unpack a MIME HTML archive
+
+
+=head1 SYNOPSIS
+
+B<mhtml-tool> unpacks MIME HTML archives that some browsers (such as Opera)
+save by default.  The file extensions of such archives are ".mht" or ".mhtml".
+
+The first HTML file in the archive is taken to be the primary web page, the
+other contained files for "page requisites" such as images or frames.  The
+primary web page is written to the output directory (the current directory by
+default), the requisites to a subdirectory named after the primary HTML file
+name without extension, with "_files" appended.  Link URLs in all HTML
+files referring to requisites are rewritten to point to the saved files.
+
+
+=head1 OPTIONS
+
+=over
+
+=item B<-e> program
+
+=item B<--exec> program
+
+Unpack the archive into a temporary directory and execute the given program
+in that directory, passing the primary HTML (first text/html entry) of the
+archive as a parameter.
+
+If B<-o> is given, use that directory location with the program.
+
+If the program is "lynx" or "(e)links(2)", this script uses the "-force-html"
+option to help when the entrypoint lacks a ".html" file-suffix.
+
+=item B<-h>
+
+=item B<-?>
+
+=item B<--help>
+
+Print a brief usage summary.
+
+=item B<-l>
+
+=item B<--list>
+
+List archive contents instead of unpacking.  Four columns are output: file
+name, MIME type, size and URL.  Unavailable entries are replaced by "(?)".
+
+=item B<-o> I<directory/ or name>
+
+=item B<--output> I<directory/ or name>
+
+If the argument ends in a slash or is an existing directory, unpack to that
+directory instead of current directory.  Otherwise the argument is taken as a
+path to the file name to write the primary HTML file to.  If the output
+directory does not exist, it is created.
+
+=back
+
+
+=head1 SEE ALSO
+
+https://github.com/ThomasDickey/mhtml-tool
+
+https://github.com/clapautius/mhtml-tool
+
+http://www.volkerschatz.com/unix/uware/unmht.html
+
+=head1 COPYLEFT
+
+B<unmht> is Copyright (c) 2012 Volker Schatz.  It may be copied and/or
+modified under the same terms as Perl.
+
+B<mhtml-tool> is Copyright (c) 2018 Tudor M. Pristavu. It may be copied and/or
+modified under the same terms as Perl.
+
+B<mhtml-tool> is Copyright (c) 2024 Thomas E. Dickey. It may be copied and/or
+modified under the same terms as Perl.
+
+=cut
+
+use Cwd;
+use File::Path;
+use File::Copy;
+use File::Glob;
+use File::Temp qw/ tempdir /;
+use URI;
+use MIME::Base64;
+use MIME::QuotedPrint;
+use HTML::PullParser;
+use HTML::Tagset;
+use Getopt::Long;
+
+our $path_limit = 255;
+our %shorten_path;
+
+# RFC 7230 (obsoletes RFC 2616) indicates that URI length is limited by the
+# server line-length, which it suggests should be at least 8000 octets.  The
+# problem is that components of pathnames might be limited to 255 bytes, while
+# the URI components can be much longer.  As an additional complication, the
+# too-long components can be both directory- and file-names, and truncating
+# the components can produce collisions.
+sub shorten_path {
+    my $actual = $_[0];
+    if ( not defined $shorten_path{$actual} ) {
+        my @parts = split /\//, $actual;
+        for my $n ( 0 .. $#parts ) {
+            $parts[$n] = substr( $parts[$n], 0, $path_limit );
+        }
+        $shorten_path{$actual} = join "/", @parts;
+    }
+    return $shorten_path{$actual};
+}
+
+# URLs can include "&" and other characters which require quoting to use in
+# a shell command.  We use system() rather than exec() to allow the temporary
+# directory to be cleaned up after running lynx.
+sub quoted {
+    my $param = $_[0];
+    $param =~ s/'/'\"'\"'/g;
+    return sprintf( "'%s'", $param );
+}
+
+# Add approriate ordinal suffix to a number.
+# -> Number
+# <- String of number with ordinal suffix
+sub ordinal {
+    return $_[0] . "th" if $_[0] > 3 && $_[0] < 20;
+    my $unitdig = $_[0] % 10;
+    return $_[0] . "st" if $unitdig == 1;
+    return $_[0] . "nd" if $unitdig == 2;
+    return $_[0] . "rd" if $unitdig == 3;
+    return $_[0] . "th";
+}
+
+{
+    my %taken;
+
+ # Find unique file name.
+ # -> Preferred file name, or undef
+ #    MHT archive name (as a fallback if no name given)
+ # <- File name not conflicting with names returned by previous calls, but which
+ #    may exist!
+    sub unique_name {
+        my ( $fname, $mhtname ) = @_;
+        my ( $trunc, $ext );
+
+        if ( defined $fname ) {
+            $fname =~ s/^\s+//;
+            $fname =~ s/\s+$//;
+            $taken{$fname} = 1, return $fname unless $taken{$fname};
+            ( $trunc, $ext ) = $fname =~ /^(.*?)(\.\w+)?$/;
+            $ext //= "";
+        }
+        else {
+            $trunc = $mhtname || "unpack";
+            $trunc =~ s/\.mht(?:ml?)?$//i;
+            $ext = "";
+        }
+        for my $suff ( 1 .. 9999 ) {
+            $fname = "${trunc}_$suff$ext";
+            $taken{$fname} = 1, return $fname unless $taken{$fname};
+            ++$suff;
+        }
+        return undef;
+    }
+
+}
+
+# Output error message and exit with return value 1.
+sub abort {
+    print STDERR "$_[0]\n";
+    exit 1;
+}
+
+# Generate output file directories and primary HTML file name depending on
+# --output option and primary HTML file name from archive.  In case the primary
+# HTML file is not the first file in the archive, the secondary files directory
+# is renamed or the files moved on the second call, when the primary HTML file
+# name is known.
+# -> Value of --output option (or  undef)
+#    Primary HTML file name from MHT archive
+#    Flag indicating if .._files subdirectory should be created
+#    Hash reference to store resulting paths to
+sub mkfiledir {
+    my ( $outputopt, $firsthtmlname, $needfilesdir, $out ) = @_;
+
+    my $prevfilespath = $$out{filespath};
+    $firsthtmlname = "unpackmht-$$" unless defined $firsthtmlname;
+    if ( !defined $outputopt ) {
+        $$out{toppath} = ".";
+    }
+    elsif ( -d $outputopt || $outputopt =~ m!/$! ) {
+        $$out{toppath} = $outputopt;
+    }
+    else {
+        ( $$out{toppath}, $firsthtmlname ) = $outputopt =~ m!^(.*/)?([^/]+)$!;
+        abort "Empty output file name." unless defined $firsthtmlname;
+        $firsthtmlname .= ".html"       unless $firsthtmlname =~ /\./;
+        $$out{toppath} = "."            unless defined $$out{toppath};
+    }
+    $$out{toppath} =~ s!/$!!;
+    $$out{firstout} = "$$out{toppath}/$firsthtmlname";
+    $$out{filesdir} = $firsthtmlname;
+    $$out{filesdir} =~ s/\.[^.]+$//;
+    $$out{filesdir} = substr( $$out{filesdir}, 0, $path_limit - 6 );
+    $$out{filesdir} .= "_files";
+    $$out{filespath} = "$$out{toppath}/$$out{filesdir}";
+
+    if ( defined $prevfilespath ) {
+        return unless $prevfilespath ne $$out{filespath};
+        return unless -d $prevfilespath;
+        if ( !-d $$out{filespath} ) {
+            File::Copy::move( $prevfilespath, $$out{filespath} )
+              or abort "Could not rename secondary files directory.";
+        }
+        else {
+            for ( File::Glob::bsd_glob("$prevfilespath/*") ) {
+                File::Copy::move( $_, $$out{filespath} )
+                  or abort "Could not move secondary files.";
+            }
+        }
+    }
+    else {
+        my $createall = $needfilesdir ? $$out{filespath} : $$out{toppath};
+        if ( !-d $createall ) {
+            File::Path::make_path($createall)
+              or abort "Could not create output directory $createall.";
+        }
+    }
+}
+
+my %opt;
+my @optdescr = ( 'exec|e=s', 'output|o=s', 'list|l!', 'help|h|?!', 'debug|d!' );
+my %config;
+
+my $status = GetOptions( \%opt, @optdescr );
+
+if ( !$status || $opt{help} ) {
+    print <<EOF;
+Usage: mhtml-tool [options] <MHT file>
+
+By default, mhtml-tool unpacks an MHT archive (an archive type saved by some
+browsers) to the current directory.  The first HTML file in the archive is
+taken for the primary web page, and all other contained files are written to a
+directory named after that HTML file.
+
+Options:
+-e, --exec    Execute the given program on a temporarily-unpacked archive
+-l, --list    List archive contents (file name, MIME type, size and URL)
+-o, --output  Unpack to directory <dir/> or to file <name>.html
+
+Use the command "pod2man mhtml-tool > mhtml-tool.1" or
+"pod2html mhtml-tool > mhtml-tool.html" to extract the manual.
+EOF
+    exit !$status;
+}
+
+my $origin  = getcwd;
+my $tempdir = "";
+if ( $opt{exec} and not defined $opt{output} ) {
+    $tempdir = tempdir( CLEANUP => 1 );
+    $opt{output} = $tempdir . "/";
+}
+
+my $orig_sep  = $/;
+my $crlf_file = 0;
+
+# Print a message to stdout (if debug is enabled).
+sub debug_msg {
+    if ( $opt{debug} ) {
+        print ":debug: $_[0]\n";
+    }
+}
+
+# Read next line. Remove line endings (both unix & windows style).
+sub read_next_line {
+    my $cur_line = <>;
+    chomp $cur_line;
+    if ( $cur_line =~ /\r$/ ) {
+        debug_msg("It's a CRLF file") if ( $crlf_file == 0 );
+        $crlf_file++;
+    }
+    $cur_line =~ s/\r//;
+    return $cur_line;
+}
+
+# Parse headers.
+# Handle multi-line headers (lines starting with spaces are part of a
+# multi-line header).
+# Read data from <> until the first empty line.
+# Set $crlf_file to 1 if it's a CRLF (windoze) file.
+sub parse_headers {
+    my %headers;
+    my $sep = $/;
+    $/ = $orig_sep;
+    my $cur_line  = read_next_line;
+    my $full_line = $cur_line;
+
+    while ( not( $cur_line =~ /^$/ ) ) {
+        if ( $cur_line =~ /;$/ ) {    # if a continued line...
+            while ( ( $cur_line = read_next_line ) =~ /^\h+/ ) {
+                $cur_line =~ s/^\h+//;
+                $full_line = $full_line . " " . $cur_line;
+            }
+            if ( $full_line =~ s/^([-\w]+): (.*)$// ) {
+                $headers{$1} = $2;
+                debug_msg("(1) New header $1 with value $2");
+            }
+        }
+        else {
+            if ( $full_line =~ s/^([-\w]+): (.*)$// ) {
+                $headers{$1} = $2;
+                debug_msg("(2) New header $1 with value $2");
+            }
+        }
+        $cur_line  = read_next_line;
+        $full_line = $cur_line;
+    }
+    $/ = $sep;
+    return %headers;
+}
+
+my %global_headers = parse_headers();
+
+abort "Can't find Content-Type header - not a MIME HTML file?"
+  if ( !defined( $global_headers{'Content-Type'} ) );
+debug_msg("Global Content-Type: $global_headers{'Content-Type'}");
+
+my $boundary = '';
+my $endcode  = '';
+
+# FIXME: - add other possible mime types
+# Types implemented so far are:
+#   Content-Type: multipart/related; boundary="----=_NextPart_01D8BCC9.47C2B260"
+#   Content-Type: text/html; charset="utf-8"
+if ( $global_headers{'Content-Type'} =~
+    m!multipart/related;.*\h+boundary="?([^"]*)"?$! )
+{
+    $endcode = $boundary = $1;
+    $endcode =~ s/\s+$//;
+    if ($crlf_file) {
+        $/ = "\r\n--$boundary\r\n";
+    }
+    else {
+        $/ = "\n--$boundary\n";
+    }
+    debug_msg("Boundary: $boundary");
+}
+elsif ( $global_headers{'Content-Type'} =~ m!text/html! ) {
+    $/ = '';
+}
+else {
+    die "Error: Unknown Content-Type: $global_headers{'Content-Type'}\n";
+}
+
+my %by_url;
+my @htmlfiles;
+my $fh;
+
+{
+    my $fileind = 1;
+    my $leading = "--$boundary\n";
+    while ( defined( my $data = <> ) ) {
+        chomp $data;
+        $data =~ s/\R/\n/g;   # handle various other line-endings (windows, mac)
+        while ( index( $data, $leading ) == 0 ) {
+            $data = substr( $data, length($leading) );
+        }
+        my %headers;
+        while ( $data =~ s/^([-\w]+): (.*)\n// ) {
+            $headers{$1} = $2;
+            debug_msg("New header $1 with value $2");
+
+          # read (and ignore atm) lines starting with space (multi-line headers)
+            while ( $data =~ s/^\h+.*\n// ) {
+                debug_msg("empty line");
+            }
+        }
+        if ( scalar(%headers) == 0 ) {
+            %headers = %global_headers
+              ;   # fallback to the global headers as needed, usually for "text"
+        }
+        $data =~ s/^\n//;
+        $data =~ s/\n--$endcode--\r?\n$/\n/s;
+        my ( $type, $origname );
+        if ( defined( $headers{"Content-Type"} ) ) {
+            ($type) = $headers{"Content-Type"} =~ /^(\w+\/\w+)\b/;
+            debug_msg("type=$type");
+        }
+        else {
+            print "Error: No Content-Type found, skipping chunk\n";
+
+        # we could print the bad chunk here...
+        # but it's probably just a "warning" about
+        #   "if you see this your software isn't recognizing a web archive file"
+            next;
+        }
+        if ( defined( $headers{"Content-Type"} )
+            && $headers{"Content-Type"} =~ /\bname=([^;]*)/ )
+        {
+            $origname = $1;
+            ($type) = $headers{"Content-Type"} =~ /^(\w+\/\w+)\b/;
+            $type //= "";
+        }
+        elsif ( defined( $headers{"Content-Disposition"} )
+            && $headers{"Content-Disposition"} =~ /\bfilename=([^;]*)/ )
+        {
+            $origname = $1;
+            if ( !defined($type) ) {
+                $type = $origname =~ /\.html?$/i ? "text/html" : "";
+            }
+        }
+        elsif ( defined( $headers{"Content-Location"} ) ) {
+            $origname = $headers{"Content-Location"};
+
+            # for unknown reasons, files generated by IE11 may contain some
+            # local paths with '\' instead of '/'
+            if ( $origname =~ m!^file://.*\\! ) {
+                debug_msg("Windows-style path detected: $origname");
+                $origname =~ s!^.*\\!!;
+            }
+            $origname =~ s!^.*/!!;
+            if ( !defined($type) ) {
+                $type = "";
+            }
+        }
+
+        $origname = "noname" unless defined $origname;
+        $origname = "noname" if ( $origname eq "" );
+        my $fname = unique_name( $origname, $ARGV[0] );
+        if ( !defined( $headers{"Content-Transfer-Encoding"} ) ) {
+            print STDERR "Info: Encoding of ", ordinal($fileind),
+              " file not found - leaving as-is.\n";
+        }
+        elsif ( $headers{"Content-Transfer-Encoding"} =~ /\bbase64\b/i ) {
+            $data = MIME::Base64::decode($data);
+        }
+        elsif (
+            $headers{"Content-Transfer-Encoding"} =~ /\bquoted-printable\b/i )
+        {
+            $data = MIME::QuotedPrint::decode($data);
+        }
+        debug_msg("origname=$origname; fname=$fname; type=$type");
+        debug_msg( "Content-Type: " . $headers{"Content-Type"} );
+        debug_msg( "Data size: " . length($data) );
+        if ( $opt{list} ) {
+            $origname =~ s/\s+$// if defined $origname;
+            my $size = length($data);
+            print $fname // "(?)", "\t", $type || "(?)", "\t$size\t",
+              $headers{"Content-Location"} // "(?)", "\n";
+            next;
+        }
+        $headers{fname} = $fname;
+        if ( $headers{"Content-Location"} ) {
+            $headers{url} = $headers{"Content-Location"};
+            $headers{url} =~ s/\s+$//;
+            $by_url{ $headers{url} } = \%headers;
+            debug_msg("Content-Location: $headers{url}");
+        }
+        else {
+            debug_msg("?? no location");
+        }
+        if ( $type eq "text/html" ) {
+            $headers{data} = $data;
+            if ( scalar @htmlfiles == 0 ) {   # first html file must have a name
+                if ( !$headers{fname} ) {
+                    $headers{fname} = "index.html";
+                }
+            }
+            push @htmlfiles, \%headers;
+            debug_msg("New html file in list: $headers{fname}");
+        }
+        else {
+            mkfiledir( $opt{output}, $htmlfiles[0]->{fname}, 1, \%config );
+            $fname = shorten_path "$config{filespath}/$fname";
+            open $fh, ">$fname" or abort "Could not create file $fname.";
+            print $fh $data;
+            close $fh;
+        }
+    }
+    continue {
+        debug_msg("");
+        ++$fileind;
+    }
+}
+
+if ( $opt{list} ) {
+    exit(0);
+}
+
+mkfiledir( $opt{output}, $htmlfiles[0]->{fname}, 0, \%config );
+
+my $entrypoint  = ".";
+my $filesprefix = $config{filesdir} . "/";
+my $outname     = $config{firstout};
+print "primary html output name: $outname\n";
+
+for my $html (@htmlfiles) {
+    my $linksubst = "";
+    my $p         = HTML::PullParser->new(
+        doc     => \$html->{data},
+        "start" => 'text, attr, tagname',
+        "text"  => 'text',
+        "end"   => 'text'
+    );
+    while ( defined( my $tok = $p->get_token() ) ) {
+        my $linkary;
+        my @linkattrs;
+        if (   ref( $tok->[1] )
+            && ( $linkary   = $HTML::Tagset::linkElements{ $tok->[2] } )
+            && ( @linkattrs = grep $tok->[1]->{$_}, @$linkary ) )
+        {
+            for my $attr (@linkattrs) {
+                my $uri = URI->new( $tok->[1]->{$attr} );
+                next unless defined $uri;
+                next unless defined $html->{url};
+                $uri = $uri->abs( $html->{url} );
+                $tok->[1]->{$attr} =
+                  "$filesprefix" . $by_url{ $uri->as_string() }->{fname}
+                  if $by_url{ $uri->as_string() };
+            }
+            delete $tok->[1]->{"/"};
+            $linksubst .=
+              "<$tok->[2] "
+              . join( " ",
+                map( "$_=\"$tok->[1]->{$_}\"", keys %{ $tok->[1] } ) )
+              . ">";
+        }
+        else {
+            $linksubst .= $tok->[0];
+        }
+    }
+    $outname = "$config{filespath}/$html->{fname}" unless defined $outname;
+    $outname = shorten_path $outname;
+    open $fh, ">$outname" or abort "Could not create file $outname.";
+    print $fh $linksubst;
+    close $fh;
+    $entrypoint = $outname if ( $entrypoint eq "." );
+
+    # for all except the first HTML file:
+    $filesprefix = "";
+    $outname     = undef;
+}
+
+if ( $tempdir ne "" ) {
+    chdir $tempdir;
+    my $enforce = "";
+    $enforce = "-force-html"
+      if (  $entrypoint ne "."
+        and $entrypoint !~ /\.htm(l)?$/
+        and $opt{exec}  =~ /(lynx|((e)?links(2)?))/ );
+    system( sprintf( "%s %s %s", $opt{exec}, $enforce, quoted $entrypoint ) );
+    chdir $origin;
+}
+
+1;