#!/usr/bin/env perl

# IRI test: recursive download of pages whose links contain non-ASCII bytes
# in several charsets.  The server declares a charset for every page in the
# Content-Type HTTP header; each URL key in %urls is the percent-encoded
# UTF-8 form of the link, and each expected file name is raw UTF-8.

use strict;
use warnings;

use WgetFeature qw(iri);
use HTTPTest;

# cf. http://en.wikipedia.org/wiki/Latin1
#     http://en.wikipedia.org/wiki/ISO-8859-15

###############################################################################
#
# mime : charset found in Content-Type HTTP MIME header
# meta : charset found in Content-Type meta tag
#
# Content-Type header sent for each page (see %urls below):
#
#   index.html          text/html;  charset=ISO-8859-15
#   p1_français.html    text/html;  charset=ISO-8859-15 (meta is deliberately wrong)
#   p2_één.html         text/html;  charset=UTF-8
#   p3_€€€.html         text/plain; charset=ISO-8859-1
#   p4_méér.html        text/plain; charset=UTF-8
#
# NOTE(review): the HTML markup in the here-documents below was rebuilt from
# the link structure implied by %urls, the Referer checks and the byte
# variables; confirm the exact tags and meta charset values against upstream.
#

# Raw byte sequences for the non-ASCII characters used in links and file
# names: "_l1" / "_l15" are the single-byte ISO-8859-1 / ISO-8859-15 forms,
# "_u8" the UTF-8 forms.
my $ccedilla_l15 = "\xE7";
my $ccedilla_u8  = "\xC3\xA7";
my $eacute_l1    = "\xE9";
my $eacute_u8    = "\xC3\xA9";
my $eurosign_l15 = "\xA4";
my $eurosign_u8  = "\xE2\x82\xAC";

# Entry page, served as ISO-8859-15: links to page 1 and page 3 using the
# single-byte forms of the c-cedilla and the euro sign.
my $pageindex = <<EOF;
<html>
<head>
  <title>Main Page</title>
</head>
<body>
  <p>
    Link to page 1 <a href="http://localhost:{{port}}/p1_fran${ccedilla_l15}ais.html">La seule page en fran&ccedil;ais</a>.
    Link to page 3 <a href="http://localhost:{{port}}/p3_${eurosign_l15}${eurosign_l15}${eurosign_l15}.html">My tailor is rich</a>.
  </p>
</body>
</html>
EOF

# specifying a wrong charset in http-equiv - it will be overridden by Content-Type HTTP header
my $pagefrancais = <<EOF;
<html>
<head>
  <title>La seule page en fran&ccedil;ais</title>
  <meta http-equiv="Content-Type" content="text/html; charset=UTF-8"/>
</head>
<body>
  <p>
    Link to page 2 <a href="http://localhost:{{port}}/p2_${eacute_l1}${eacute_l1}n.html">Die enkele nerderlangstalige pagina</a>.
  </p>
</body>
</html>
EOF

# Page 2, served as UTF-8: its link to page 4 uses the UTF-8 e-acute bytes.
my $pageeen = <<EOF;
<html>
<head>
  <title>Die enkele nederlandstalige pagina</title>
  <meta http-equiv="Content-Type" content="text/html; charset=UTF-8"/>
</head>
<body>
  <p>
    &Eacute;&eacute;n is niet veel maar toch meer dan nul.<br/>
    Nerdelands is een mooie taal... dit zin stuckje spreekt vanzelf, of niet :)<br/>
    <a href="http://localhost:{{port}}/p4_m${eacute_u8}${eacute_u8}r.html">M&eacute;&eacute;r</a>
  </p>
</body>
</html>
EOF

# Page 3: a leaf page without further links.
my $pageeuro = <<EOF;
<html>
<head>
  <title>Euro page</title>
</head>
<body>
  <p>
    My tailor isn't rich anymore.
  </p>
</body>
</html>
EOF

# Page 4: a leaf page without further links.
my $pagemeer = <<EOF;
<html>
<head>
  <title>Bekende supermarkt</title>
</head>
<body>
  <p>
    Ik ben toch niet gek !
  </p>
</body>
</html>
EOF

# Body for a "not found" answer; no entry in %urls serves it at present.
my $page404 = <<EOF;
<html>
<head>
  <title>404</title>
</head>
<body>
  <p>
    Nop nop nop...
  </p>
</body>
</html>
EOF

# code, msg, headers, content
my %urls = (
    '/index.html' => {
        code => "200",
        msg => "Ok",
        headers => {
            "Content-type" => "text/html; charset=ISO-8859-15",
        },
        content => $pageindex,
    },
    '/robots.txt' => {
        code => "200",
        msg => "Ok",
        headers => {
            "Content-type" => "text/plain",
        },
        content => "",
    },
    '/p1_fran%C3%A7ais.html' => {   # UTF-8 encoded
        code => "200",
        msg => "Ok",
        headers => {
            # Content-Type header overrides http-equiv Content-Type
            "Content-type" => "text/html; charset=ISO-8859-15",
        },
        content => $pagefrancais,
    },
    '/p2_%C3%A9%C3%A9n.html' => {   # UTF-8 encoded
        code => "200",
        msg => "Ok",
        # The request for page 2 must carry page 1 as its referrer.
        request_headers => {
            "Referer" => qr|http://localhost:[0-9]+/p1_fran%C3%A7ais.html|,
        },
        headers => {
            "Content-type" => "text/html; charset=UTF-8",
        },
        content => $pageeen,
    },
    '/p3_%E2%82%AC%E2%82%AC%E2%82%AC.html' => {   # UTF-8 encoded
        code => "200",
        msg => "Ok",
        headers => {
            "Content-type" => "text/plain; charset=ISO-8859-1",
        },
        content => $pageeuro,
    },
    '/p4_m%C3%A9%C3%A9r.html' => {
        code => "200",
        msg => "Ok",
        # The request for page 4 must carry page 2 as its referrer.
        request_headers => {
            "Referer" => qr|http://localhost:[0-9]+/p2_%C3%A9%C3%A9n.html|,
        },
        headers => {
            "Content-type" => "text/plain; charset=UTF-8",
        },
        content => $pagemeer,
    },
);

my $cmdline = $WgetTest::WGETPATH
    . " --iri --trust-server-names --restrict-file-names=nocontrol -nH -r http://localhost:{{port}}/";

my $expected_error_code = 0;

# Every page is expected on disk under its UTF-8 file name, with exactly the
# content that was served.
my %expected_downloaded_files = (
    'index.html' => {
        content => $pageindex,
    },
    'robots.txt' => {
        content => "",
    },
    "p1_fran${ccedilla_u8}ais.html" => {
        content => $pagefrancais,
    },
    "p2_${eacute_u8}${eacute_u8}n.html" => {
        content => $pageeen,
    },
    "p3_${eurosign_u8}${eurosign_u8}${eurosign_u8}.html" => {
        content => $pageeuro,
    },
    "p4_m${eacute_u8}${eacute_u8}r.html" => {
        content => $pagemeer,
    },
);

###############################################################################

my $the_test = HTTPTest->new (input => \%urls,
                              cmdline => $cmdline,
                              errcode => $expected_error_code,
                              output => \%expected_downloaded_files);
exit $the_test->run();

# vim: et ts=4 sw=4