diff options
Diffstat (limited to '')
-rwxr-xr-x | samples/mhtml-tool | 568 |
1 files changed, 568 insertions, 0 deletions
#!/usr/bin/env perl

use strict;
use warnings;

$| = 1;

=pod

=head1 NAME

B<mhtml-tool> - Unpack a MIME HTML archive


=head1 SYNOPSIS

B<mhtml-tool> unpacks MIME HTML archives that some browsers (such as Opera)
save by default. The file extensions of such archives are ".mht" or ".mhtml".

The first HTML file in the archive is taken to be the primary web page, the
other contained files for "page requisites" such as images or frames. The
primary web page is written to the output directory (the current directory by
default), the requisites to a subdirectory named after the primary HTML file
name without extension, with "_files" appended. Link URLs in all HTML
files referring to requisites are rewritten to point to the saved files.


=head1 OPTIONS

=over

=item B<-e> program

=item B<--exec> program

Unpack the archive into a temporary directory and execute the given program
in that directory, passing the primary HTML (first text/html entry) of the
archive as a parameter.

If B<-o> is given, use that directory location with the program.

If the program is "lynx" or "(e)links(2)", this script uses the "-force-html"
option to help when the entrypoint lacks a ".html" file-suffix.

=item B<-h>

=item B<-?>

=item B<--help>

Print a brief usage summary.

=item B<-l>

=item B<--list>

List archive contents instead of unpacking. Four columns are output: file
name, MIME type, size and URL. Unavailable entries are replaced by "(?)".

=item B<-o> I<directory/ or name>

=item B<--output> I<directory/ or name>

If the argument ends in a slash or is an existing directory, unpack to that
directory instead of current directory. Otherwise the argument is taken as a
path to the file name to write the primary HTML file to. If the output
directory does not exist, it is created.

=back


=head1 SEE ALSO

https://github.com/ThomasDickey/mhtml-tool

https://github.com/clapautius/mhtml-tool

http://www.volkerschatz.com/unix/uware/unmht.html

=head1 COPYLEFT

B<unmht> is Copyright (c) 2012 Volker Schatz. It may be copied and/or
modified under the same terms as Perl.

B<mhtml-tool> is Copyright (c) 2018 Tudor M. Pristavu. It may be copied and/or
modified under the same terms as Perl.

B<mhtml-tool> is Copyright (c) 2024 Thomas E. Dickey. It may be copied and/or
modified under the same terms as Perl.

=cut

use Cwd;
use File::Path;
use File::Copy;
use File::Glob;
use File::Temp qw/ tempdir /;
use URI;
use MIME::Base64;
use MIME::QuotedPrint;
use HTML::PullParser;
use HTML::Tagset;
use Getopt::Long;

# Longest path component we are willing to create, and a cache mapping each
# requested path to its shortened form.
our $path_limit = 255;
our %shorten_path;

# RFC 7230 (obsoletes RFC 2616) indicates that URI length is limited by the
# server line-length, which it suggests should be at least 8000 octets.  File
# systems, on the other hand, commonly limit each path component to 255
# bytes.  Both directory and file names taken from a URI can exceed that, so
# every "/"-separated component is cut down to $path_limit characters.  Note
# that truncation can make two different paths collide.
# -> Path to shorten
# <- Path with every component truncated to $path_limit characters
sub shorten_path {
    my ($full_path) = @_;

    # Results are memoized per input path in %shorten_path.
    $shorten_path{$full_path} //= join "/",
      map { substr( $_, 0, $path_limit ) } split /\//, $full_path;

    return $shorten_path{$full_path};
}

# URLs can include "&" and other characters which require quoting to use in
# a shell command.  The parameter is wrapped in single quotes; an embedded
# single quote is emitted as '"'"' (close quote, double-quoted quote, reopen).
# -> String to pass to the shell
# <- The string, single-quoted for the shell
sub quoted {
    my ($text) = @_;
    ( my $escaped = $text ) =~ s/'/'\"'\"'/g;
    return "'" . $escaped . "'";
}

# Add appropriate ordinal suffix to a number.
# -> Number
# <- String of number with ordinal suffix
# The "teens" test looks at the last two digits only, so that 11, 12 and 13
# as well as 111, 212, ... get "th" instead of "st", "nd" or "rd".
sub ordinal {
    my ($number) = @_;
    my $last_two = $number % 100;
    return $number . "th" if $last_two > 3 && $last_two < 20;
    my $unitdig = $number % 10;
    return $number . "st" if $unitdig == 1;
    return $number . "nd" if $unitdig == 2;
    return $number . "rd" if $unitdig == 3;
    return $number . "th";
}

{
    # Names handed out so far; private to unique_name.
    my %taken;

    # Find unique file name.
    # -> Preferred file name, or undef
    #    MHT archive name (as a fallback if no name given)
    # <- File name not conflicting with names returned by previous calls, but
    #    which may exist!  Returns nothing (undef in scalar context) when all
    #    9999 numbered variants are already taken.
    sub unique_name {
        my ( $fname, $mhtname ) = @_;
        my ( $trunc, $ext );

        if ( defined $fname ) {
            $fname =~ s/^\s+//;
            $fname =~ s/\s+$//;
            if ( !$taken{$fname} ) {
                $taken{$fname} = 1;
                return $fname;
            }

            # Split "name.ext" so that the counter goes before the extension.
            ( $trunc, $ext ) = $fname =~ /^(.*?)(\.\w+)?$/;
            $ext //= "";
        }
        else {
            $trunc = $mhtname || "unpack";
            $trunc =~ s/\.mht(?:ml?)?$//i;
            $ext = "";
        }
        for my $suff ( 1 .. 9999 ) {
            my $candidate = "${trunc}_$suff$ext";
            next if $taken{$candidate};
            $taken{$candidate} = 1;
            return $candidate;
        }
        return;
    }

}

# Output error message and exit with return value 1.
sub abort {
    print STDERR "$_[0]\n";
    exit 1;
}

# Escape an attribute value for output between double quotes.  The HTML
# parser hands out attribute values with entities already decoded, so they
# have to be encoded again when a tag is rebuilt.
# -> Decoded attribute value (or undef)
# <- Value with &, ", < and > replaced by entities
sub escape_attr {
    my ($value) = @_;
    $value = "" unless defined $value;
    $value =~ s/&/&amp;/g;
    $value =~ s/"/&quot;/g;
    $value =~ s/</&lt;/g;
    $value =~ s/>/&gt;/g;
    return $value;
}

# Generate output file directories and primary HTML file name depending on
# --output option and primary HTML file name from archive.  In case the primary
# HTML file is not the first file in the archive, the secondary files directory
# is renamed or the files moved on the second call, when the primary HTML file
# name is known.
# -> Value of --output option (or undef)
#    Primary HTML file name from MHT archive (or undef if not known yet)
#    Flag indicating if .._files subdirectory should be created
#    Hash reference to store resulting paths to; the keys written are
#    toppath, firstout, filesdir and filespath
sub mkfiledir {
    my ( $outputopt, $firsthtmlname, $needfilesdir, $out ) = @_;

    my $prevfilespath = $$out{filespath};

    # Placeholder name used until the primary HTML file shows up.
    $firsthtmlname = "unpackmht-$$" unless defined $firsthtmlname;
    if ( !defined $outputopt ) {
        $$out{toppath} = ".";
    }
    elsif ( -d $outputopt || $outputopt =~ m!/$! ) {
        $$out{toppath} = $outputopt;
    }
    else {
        # --output names the primary HTML file itself.
        ( $$out{toppath}, $firsthtmlname ) = $outputopt =~ m!^(.*/)?([^/]+)$!;
        abort "Empty output file name." unless defined $firsthtmlname;
        $firsthtmlname .= ".html" unless $firsthtmlname =~ /\./;
        $$out{toppath} = "." unless defined $$out{toppath};
    }

    # Drop one trailing slash, but keep a bare "/" (the root directory).
    $$out{toppath} =~ s!/$!! unless $$out{toppath} eq "/";
    $$out{firstout} = "$$out{toppath}/$firsthtmlname";
    $$out{filesdir} = $firsthtmlname;
    $$out{filesdir} =~ s/\.[^.]+$//;

    # Leave room for the 6 characters of "_files".
    $$out{filesdir} = substr( $$out{filesdir}, 0, $path_limit - 6 );
    $$out{filesdir} .= "_files";
    $$out{filespath} = "$$out{toppath}/$$out{filesdir}";

    if ( defined $prevfilespath ) {
        return unless $prevfilespath ne $$out{filespath};
        return unless -d $prevfilespath;
        if ( !-d $$out{filespath} ) {
            File::Copy::move( $prevfilespath, $$out{filespath} )
              or abort "Could not rename secondary files directory.";
        }
        else {
            for my $entry ( File::Glob::bsd_glob("$prevfilespath/*") ) {
                File::Copy::move( $entry, $$out{filespath} )
                  or abort "Could not move secondary files.";
            }

            # Remove the placeholder directory if it is empty now; a failure
            # (e.g. remaining dot files) is harmless and ignored.
            rmdir $prevfilespath;
        }
    }
    else {
        my $createall = $needfilesdir ? $$out{filespath} : $$out{toppath};
        if ( !-d $createall ) {
            File::Path::make_path($createall)
              or abort "Could not create output directory $createall.";
        }
    }
    return;
}

my %opt;
my @optdescr = ( 'exec|e=s', 'output|o=s', 'list|l!', 'help|h|?!', 'debug|d!' );
my %config;

my $status = GetOptions( \%opt, @optdescr );

if ( !$status || $opt{help} ) {
    print <<EOF;
Usage: mhtml-tool [options] <MHT file>

By default, mhtml-tool unpacks an MHT archive (an archive type saved by some
browsers) to the current directory. The first HTML file in the archive is
taken for the primary web page, and all other contained files are written to a
directory named after that HTML file.

Options:
-e, --exec Execute the given program on a temporarily-unpacked archive
-l, --list List archive contents (file name, MIME type, size and URL)
-o, --output Unpack to directory <dir/> or to file <name>.html

Use the command "pod2man mhtml-tool > mhtml-tool.1" or
"pod2html mhtml-tool > mhtml-tool.html" to extract the manual.
EOF

    # Exit status 0 for --help, 1 for a bad command line.
    exit( $status ? 0 : 1 );
}

my $origin  = getcwd;
my $tempdir = "";
if ( $opt{exec} and not defined $opt{output} ) {
    $tempdir = tempdir( CLEANUP => 1 );
    $opt{output} = $tempdir . "/";
}

my $orig_sep  = $/;
my $crlf_file = 0;

# Print a message to stdout (if debug is enabled).
sub debug_msg {
    if ( $opt{debug} ) {
        print ":debug: $_[0]\n";
    }
    return;
}

# Read next line from <>.  Remove line endings (both unix & windows style).
# Counts lines ending in CR LF in $crlf_file.
# <- The line without its line ending; the empty string at end of input, so
#    that the header parser stops there as it does at an empty line.
sub read_next_line {
    my $cur_line = <>;
    return "" unless defined $cur_line;
    chomp $cur_line;
    if ( $cur_line =~ /\r$/ ) {
        debug_msg("It's a CRLF file") if ( $crlf_file == 0 );
        $crlf_file++;
    }
    $cur_line =~ s/\r$//;
    return $cur_line;
}

# Parse headers.
# Handle multi-line headers: when a header line ends in ";", the following
# lines starting with horizontal whitespace are appended to it.
# Read data from <> up to and including the first empty line.
# $crlf_file becomes non-zero if it's a CRLF (windoze) file.
# <- Hash (as a list) mapping header names to their values
sub parse_headers {
    my %headers;

    # Headers are read line by line, whatever separator is active.
    local $/ = $orig_sep;

    my $cur_line = read_next_line();
    while ( $cur_line ne "" ) {
        my $full_line = $cur_line;
        my $folded    = $cur_line =~ /;$/;

        # Always look one line ahead, and hand that line on to the next
        # iteration, so that the line terminating a folded header (the next
        # header, or the empty line ending the header section) is not lost.
        my $next_line = read_next_line();
        if ($folded) {
            while ( $next_line =~ /^\h+/ ) {
                $next_line =~ s/^\h+//;
                $full_line .= " " . $next_line;
                $next_line = read_next_line();
            }
        }
        if ( $full_line =~ /^([-\w]+): (.*)$/ ) {
            $headers{$1} = $2;
            debug_msg( "("
                  . ( $folded ? 1 : 2 )
                  . ") New header $1 with value $2" );
        }
        $cur_line = $next_line;
    }
    return %headers;
}

my %global_headers = parse_headers();

abort("Can't find Content-Type header - not a MIME HTML file?")
  if ( !defined( $global_headers{'Content-Type'} ) );
debug_msg("Global Content-Type: $global_headers{'Content-Type'}");

my $boundary = '';
my $endcode  = '';

# FIXME: - add other possible mime types
# Types implemented so far are:
# Content-Type: multipart/related; boundary="----=_NextPart_01D8BCC9.47C2B260"
# Content-Type: text/html; charset="utf-8"
if ( $global_headers{'Content-Type'} =~
    m!multipart/related;.*\h+boundary="?([^"]*)"?$! )
{
    $endcode = $boundary = $1;
    $endcode =~ s/\s+$//;

    # From here on <> returns one MIME part per read.
    if ($crlf_file) {
        $/ = "\r\n--$boundary\r\n";
    }
    else {
        $/ = "\n--$boundary\n";
    }
    debug_msg("Boundary: $boundary");
}
elsif ( $global_headers{'Content-Type'} =~ m!text/html! ) {
    $/ = '';
}
else {
    die "Error: Unknown Content-Type: $global_headers{'Content-Type'}\n";
}

my %by_url;       # part headers (incl. fname) indexed by Content-Location
my @htmlfiles;    # headers + data of all text/html parts, primary one first
my $fh;

{
    my $fileind = 1;
    my $leading = "--$boundary\n";
    while ( defined( my $data = <> ) ) {
        chomp $data;
        $data =~ s/\R/\n/g;  # handle various other line-endings (windows, mac)
        while ( index( $data, $leading ) == 0 ) {
            $data = substr( $data, length($leading) );
        }
        my %headers;
        while ( $data =~ s/^([-\w]+): (.*)\n// ) {
            $headers{$1} = $2;
            debug_msg("New header $1 with value $2");

           # read (and ignore atm) lines starting with space (multi-line headers)
            while ( $data =~ s/^\h+.*\n// ) {
                debug_msg("empty line");
            }
        }
        if ( !%headers ) {

            # fallback to the global headers as needed, usually for "text"
            %headers = %global_headers;
        }
        $data =~ s/^\n//;

        # Strip the closing delimiter.  The boundary is quoted because it may
        # contain regex metacharacters such as "+", "(", ")" or "?".
        $data =~ s/\n--\Q$endcode\E--\r?\n$/\n/s;
        my ( $type, $origname );
        if ( defined( $headers{"Content-Type"} ) ) {
            ($type) = $headers{"Content-Type"} =~ /^(\w+\/\w+)\b/;
            debug_msg( "type=" . ( $type // "(?)" ) );
        }
        else {
            print STDERR "Error: No Content-Type found, skipping chunk\n";

   # we could print the bad chunk here...
   # but it's probably just a "warning" about
   # "if you see this your software isn't recognizing a web archive file"
            next;
        }
        if ( $headers{"Content-Type"} =~ /\bname=([^;]*)/ ) {
            $origname = $1;

            # The parameter value is usually a quoted string.
            $origname =~ s/^\s*"(.*)"\s*$/$1/;
        }
        elsif ( defined( $headers{"Content-Disposition"} )
            && $headers{"Content-Disposition"} =~ /\bfilename=([^;]*)/ )
        {
            $origname = $1;
            $origname =~ s/^\s*"(.*)"\s*$/$1/;
            if ( !defined($type) ) {
                $type = $origname =~ /\.html?$/i ? "text/html" : "";
            }
        }
        elsif ( defined( $headers{"Content-Location"} ) ) {
            $origname = $headers{"Content-Location"};

            # for unknown reasons, files generated by IE11 may contain some
            # local paths with '\' instead of '/'
            if ( $origname =~ m!^file://.*\\! ) {
                debug_msg("Windows-style path detected: $origname");
                $origname =~ s!^.*\\!!;
            }
        }
        $type //= "";

        # Names come from the (untrusted) archive: keep the last path
        # component only, so that nothing is written outside the output
        # directory.
        $origname =~ s!^.*/!! if defined $origname;

        $origname = "noname" unless defined $origname;
        $origname = "noname" if ( $origname eq "" );
        my $fname = unique_name( $origname, $ARGV );
        if ( !defined( $headers{"Content-Transfer-Encoding"} ) ) {
            print STDERR "Info: Encoding of ", ordinal($fileind),
              " file not found - leaving as-is.\n";
        }
        elsif ( $headers{"Content-Transfer-Encoding"} =~ /\bbase64\b/i ) {
            $data = MIME::Base64::decode($data);
        }
        elsif (
            $headers{"Content-Transfer-Encoding"} =~ /\bquoted-printable\b/i )
        {
            $data = MIME::QuotedPrint::decode($data);
        }
        debug_msg( "origname=$origname; fname="
              . ( $fname // "(?)" )
              . "; type=$type" );
        debug_msg( "Content-Type: " . $headers{"Content-Type"} );
        debug_msg( "Data size: " . length($data) );
        if ( $opt{list} ) {
            my $size = length($data);
            print $fname // "(?)", "\t", $type || "(?)", "\t$size\t",
              $headers{"Content-Location"} // "(?)", "\n";
            next;
        }
        $headers{fname} = $fname;
        if ( $headers{"Content-Location"} ) {
            $headers{url} = $headers{"Content-Location"};
            $headers{url} =~ s/\s+$//;
            $by_url{ $headers{url} } = \%headers;
            debug_msg("Content-Location: $headers{url}");
        }
        else {
            debug_msg("?? no location");
        }
        if ( $type eq "text/html" ) {

            # HTML is kept in memory; links are rewritten once all parts
            # (and therefore all file names) are known.
            $headers{data} = $data;
            if ( scalar @htmlfiles == 0 ) {   # first html file must have a name
                if ( !$headers{fname} ) {
                    $headers{fname} = "index.html";
                }
            }
            push @htmlfiles, \%headers;
            debug_msg("New html file in list: $headers{fname}");
        }
        else {
            # Do not write "$htmlfiles[0]->{fname}" here: while the list is
            # still empty, that would autovivify a bogus first entry.
            my $primary = @htmlfiles ? $htmlfiles[0]{fname} : undef;
            mkfiledir( $opt{output}, $primary, 1, \%config );
            $fname = shorten_path("$config{filespath}/$fname");
            open $fh, '>', $fname or abort "Could not create file $fname.";
            binmode $fh;    # decoded parts are raw bytes (images etc.)
            print $fh $data;
            close $fh or abort "Could not write file $fname.";
        }
    }
    continue {
        debug_msg("");
        ++$fileind;
    }
}

if ( $opt{list} ) {
    exit(0);
}

{
    my $primary = @htmlfiles ? $htmlfiles[0]{fname} : undef;
    mkfiledir( $opt{output}, $primary, 0, \%config );
}

my $entrypoint  = ".";
my $filesprefix = $config{filesdir} . "/";
my $outname     = $config{firstout};
if (@htmlfiles) {
    print "primary html output name: $outname\n";
}
else {
    print STDERR "Info: No HTML file found in archive.\n";
}

for my $html (@htmlfiles) {
    my $linksubst = "";

    # Declarations (DOCTYPE), comments and processing instructions are
    # requested as well, so that they are copied to the output unchanged.
    my $p = HTML::PullParser->new(
        doc           => \$html->{data},
        "start"       => 'text, attr, tagname',
        "text"        => 'text',
        "end"         => 'text',
        "declaration" => 'text',
        "comment"     => 'text',
        "process"     => 'text'
    );
    while ( defined( my $tok = $p->get_token() ) ) {
        my $linkary;
        my @linkattrs;

        # Only start tags carry an attribute hash in $tok->[1].  A tag is
        # rebuilt only if it has at least one non-empty link attribute.
        if (   ref( $tok->[1] )
            && ( $linkary = $HTML::Tagset::linkElements{ $tok->[2] } )
            && ( @linkattrs = grep $tok->[1]->{$_}, @$linkary ) )
        {
            for my $attr (@linkattrs) {
                my $uri = URI->new( $tok->[1]->{$attr} );
                next unless defined $uri;
                next unless defined $html->{url};
                $uri = $uri->abs( $html->{url} );

                # Point the link to the unpacked file, if it is in the archive.
                $tok->[1]->{$attr} =
                  "$filesprefix" . $by_url{ $uri->as_string() }->{fname}
                  if $by_url{ $uri->as_string() };
            }
            delete $tok->[1]->{"/"};
            $linksubst .=
                "<$tok->[2] "
              . join( " ",
                map( "$_=\"" . escape_attr( $tok->[1]->{$_} ) . "\"",
                    sort keys %{ $tok->[1] } ) )
              . ">";
        }
        else {
            $linksubst .= $tok->[0];
        }
    }
    if ( !defined $outname ) {

        # Secondary HTML file: the files directory does not exist yet if the
        # archive contains nothing but HTML parts.
        if ( !-d $config{filespath} ) {
            File::Path::make_path( $config{filespath} )
              or abort
              "Could not create output directory $config{filespath}.";
        }
        $outname = "$config{filespath}/$html->{fname}";
    }
    $outname = shorten_path($outname);
    open $fh, '>', $outname or abort "Could not create file $outname.";
    print $fh $linksubst;
    close $fh or abort "Could not write file $outname.";
    $entrypoint = $outname if ( $entrypoint eq "." );

    # for all except the first HTML file:
    $filesprefix = "";
    $outname     = undef;
}

# Run the --exec program on the primary HTML file.  We use system() rather
# than exec() to allow the temporary directory to be cleaned up afterwards.
# With --output the archive was unpacked there and the program is started
# from the current directory.
if ( $opt{exec} ) {
    if ( $tempdir ne "" ) {
        chdir $tempdir or abort "Could not change to directory $tempdir.";
    }
    my $enforce = "";
    $enforce = "-force-html"
      if (  $entrypoint ne "."
        and $entrypoint !~ /\.htm(l)?$/
        and $opt{exec} =~ /(lynx|((e)?links(2)?))/ );
    system( sprintf( "%s %s %s", $opt{exec}, $enforce, quoted($entrypoint) ) );

    # Leave the temporary directory so that it can be removed at exit.
    chdir $origin;
}

1;