diff options
Diffstat (limited to '')
-rw-r--r-- | src/boost/tools/inspect/link_check.cpp | 487 |
1 files changed, 487 insertions, 0 deletions
diff --git a/src/boost/tools/inspect/link_check.cpp b/src/boost/tools/inspect/link_check.cpp new file mode 100644 index 00000000..182af3bf --- /dev/null +++ b/src/boost/tools/inspect/link_check.cpp @@ -0,0 +1,487 @@ +// link_check implementation -----------------------------------------------// + +// Copyright Beman Dawes 2002. +// +// Distributed under the Boost Software License, Version 1.0. +// (See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt) + +#include "link_check.hpp" +#include "boost/regex.hpp" +#include "boost/filesystem/operations.hpp" +#include <boost/algorithm/string/case_conv.hpp> +#include <cstdlib> +#include <set> + +// #include <iostream> + +namespace fs = boost::filesystem; + +namespace +{ + boost::regex html_bookmark_regex( + "<([^\\s<>]*)\\s*[^<>]*\\s+(NAME|ID)\\s*=\\s*(['\"])(.*?)\\3" + "|<!--.*?-->", + boost::regbase::normal | boost::regbase::icase); + boost::regex html_url_regex( + "<([^\\s<>]*)\\s*[^<>]*\\s+(?:HREF|SRC)" // HREF or SRC + "\\s*=\\s*(['\"])\\s*(.*?)\\s*\\2" + "|<!--.*?-->", + boost::regbase::normal | boost::regbase::icase); + boost::regex css_url_regex( + "(\\@import\\s*[\"']|url\\s*\\(\\s*[\"']?)([^\"')]*)" + "|/\\*.*?\\*/", + boost::regbase::normal | boost::regbase::icase); + + // Regular expression for parsing URLS from: + // http://tools.ietf.org/html/rfc3986#appendix-B + boost::regex url_decompose_regex( + "^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\\?([^#]*))?(#(.*))?$", + boost::regbase::normal); + + typedef std::set<std::string> bookmark_set; + bookmark_set bookmarks; + bookmark_set bookmarks_lowercase; // duplicate check needs case insensitive + + // Decode html escapsed ampersands, returns an empty string if there's an error. + std::string decode_ampersands(std::string const& url_path) { + std::string::size_type pos = 0, next; + std::string result; + result.reserve(url_path.length()); + + while((next = url_path.find('&', pos)) != std::string::npos) { + result.append(url_path, pos, next - pos); + pos = next; + if(url_path.substr(pos, 5) == "&") { + result += '&'; pos += 5; + } + else { + result += '&'; pos += 1; + } + break; + } + + result.append(url_path, pos, url_path.length()); + + return result; + } + + // Decode percent encoded characters, returns an empty string if there's an error. + std::string decode_percents(std::string const& url_path) { + std::string::size_type pos = 0, next; + std::string result; + result.reserve(url_path.length()); + + while((next = url_path.find('%', pos)) != std::string::npos) { + result.append(url_path, pos, next - pos); + pos = next; + switch(url_path[pos]) { + case '%': { + if(url_path.length() - next < 3) return ""; + char hex[3] = { url_path[next + 1], url_path[next + 2], '\0' }; + char* end_ptr; + result += (char) std::strtol(hex, &end_ptr, 16); + if(*end_ptr) return ""; + pos = next + 3; + break; + } + } + } + + result.append(url_path, pos, url_path.length()); + + return result; + } + + bool is_css(const path & p) { + return p.extension() == ".css"; + } + +} // unnamed namespace + +namespace boost +{ + namespace inspect + { + +// link_check constructor --------------------------------------------------// + + link_check::link_check() + : m_broken_errors(0), m_unlinked_errors(0), m_invalid_errors(0), + m_bookmark_errors(0), m_duplicate_bookmark_errors(0) + { + // HTML signatures are already registered by the base class, + // 'hypertext_inspector' + register_signature(".css"); + } + +// inspect (all) -----------------------------------------------------------// + + void link_check::inspect( + const string & /*library_name*/, + const path & full_path ) + { + // keep track of paths already encountered to reduce disk activity + if ( !fs::is_directory( full_path ) ) + m_paths[ relative_to( full_path, search_root_path() ) ] |= m_present; + } + +// inspect ( .htm, .html, .shtml, .css ) -----------------------------------// + + void link_check::inspect( + const string & library_name, + const path & full_path, // example: c:/foo/boost/filesystem/path.hpp + const string & contents ) // contents of file to be inspected + { + if (contents.find( "boostinspect:" "nounlinked" ) != string::npos) + m_paths[ relative_to( full_path, search_root_path() ) ] |= m_nounlinked_errors; + + bool no_link_errors = + (contents.find( "boostinspect:" "nolink" ) != string::npos); + + // build bookmarks databases + bookmarks.clear(); + bookmarks_lowercase.clear(); + string::const_iterator a_start( contents.begin() ); + string::const_iterator a_end( contents.end() ); + boost::match_results< string::const_iterator > a_what; + boost::match_flag_type a_flags = boost::match_default; + + if(!is_css(full_path)) + { + string previous_id; + + while( boost::regex_search( a_start, a_end, a_what, html_bookmark_regex, a_flags) ) + { + // a_what[0] contains the whole string iterators. + // a_what[1] contains the tag iterators. + // a_what[2] contains the attribute name. + // a_what[4] contains the bookmark iterators. + + if (a_what[4].matched) + { + string tag( a_what[1].first, a_what[1].second ); + boost::algorithm::to_lower(tag); + string attribute( a_what[2].first, a_what[2].second ); + boost::algorithm::to_lower(attribute); + string bookmark( a_what[4].first, a_what[4].second ); + + bool name_following_id = ( attribute == "name" && previous_id == bookmark ); + if ( tag != "meta" && attribute == "id" ) previous_id = bookmark; + else previous_id.clear(); + + if ( tag != "meta" && !name_following_id ) + { + bookmarks.insert( bookmark ); +// std::cout << "******************* " << bookmark << '\n'; + + // w3.org recommends case-insensitive checking for duplicate bookmarks + // since some browsers do a case-insensitive match. + string bookmark_lowercase( bookmark ); + boost::algorithm::to_lower(bookmark_lowercase); + + std::pair<bookmark_set::iterator, bool> result + = bookmarks_lowercase.insert( bookmark_lowercase ); + if (!result.second) + { + ++m_duplicate_bookmark_errors; + int ln = std::count( contents.begin(), a_what[3].first, '\n' ) + 1; + error( library_name, full_path, "Duplicate bookmark: " + bookmark, ln ); + } + } + } + + a_start = a_what[0].second; // update search position + a_flags |= boost::match_prev_avail; // update flags + a_flags |= boost::match_not_bob; + } + } + + // process urls + string::const_iterator start( contents.begin() ); + string::const_iterator end( contents.end() ); + boost::match_results< string::const_iterator > what; + boost::match_flag_type flags = boost::match_default; + + if(!is_css(full_path)) + { + while( boost::regex_search( start, end, what, html_url_regex, flags) ) + { + // what[0] contains the whole string iterators. + // what[1] contains the element type iterators. + // what[3] contains the URL iterators. + + if(what[3].matched) + { + string type( what[1].first, what[1].second ); + boost::algorithm::to_lower(type); + + // TODO: Complain if 'link' tags use external stylesheets. + do_url( string( what[3].first, what[3].second ), + library_name, full_path, no_link_errors, + type == "a" || type == "link", contents.begin(), what[3].first ); + } + + start = what[0].second; // update search position + flags |= boost::match_prev_avail; // update flags + flags |= boost::match_not_bob; + } + } + + while( boost::regex_search( start, end, what, css_url_regex, flags) ) + { + // what[0] contains the whole string iterators. + // what[2] contains the URL iterators. + + if(what[2].matched) + { + do_url( string( what[2].first, what[2].second ), + library_name, full_path, no_link_errors, false, + contents.begin(), what[3].first ); + } + + start = what[0].second; // update search position + flags |= boost::match_prev_avail; // update flags + flags |= boost::match_not_bob; + } + } + +// do_url ------------------------------------------------------------------// + + void link_check::do_url( const string & url, const string & library_name, + const path & source_path, bool no_link_errors, bool allow_external_content, + std::string::const_iterator contents_begin, std::string::const_iterator url_start ) + // precondition: source_path.is_complete() + { + if(!no_link_errors && url.empty()) { + ++m_invalid_errors; + int ln = std::count( contents_begin, url_start, '\n' ) + 1; + error( library_name, source_path, "Empty URL.", ln ); + return; + } + + // Decode ampersand encoded characters. + string decoded_url = is_css(source_path) ? url : decode_ampersands(url); + if(decoded_url.empty()) { + if(!no_link_errors) { + ++m_invalid_errors; + int ln = std::count( contents_begin, url_start, '\n' ) + 1; + error( library_name, source_path, + "Invalid URL (invalid ampersand encodings): " + url, ln ); + } + return; + } + + boost::smatch m; + if(!boost::regex_match(decoded_url, m, url_decompose_regex)) { + if(!no_link_errors) { + ++m_invalid_errors; + int ln = std::count( contents_begin, url_start, '\n' ) + 1; + error( library_name, source_path, "Invalid URL: " + decoded_url, ln ); + } + return; + } + + bool scheme_matched = m[2].matched, + authority_matched = m[4].matched, + //query_matched = m[7].matched, + fragment_matched = m[9].matched; + + std::string scheme(m[2]), + authority(m[4]), + url_path(m[5]), + //query(m[7]), + fragment(m[9]); + + // Check for external content + if(!allow_external_content && (authority_matched || scheme_matched)) { + if(!no_link_errors) { + ++m_invalid_errors; + int ln = std::count( contents_begin, url_start, '\n' ) + 1; + error( library_name, source_path, "External content: " + decoded_url, ln ); + } + } + + // Protocol checks + if(scheme_matched) { + if(scheme == "http" || scheme == "https") { + // All http links should have a hostname. Generally if they don't + // it's by mistake. If they shouldn't, then a protocol isn't + // required. + if(!authority_matched) { + if(!no_link_errors) { + ++m_invalid_errors; + int ln = std::count( contents_begin, url_start, '\n' ) + 1; + error( library_name, source_path, "No hostname: " + decoded_url, ln ); + } + } + + return; + } + else if(scheme == "file") { + if(!no_link_errors) { + ++m_invalid_errors; + int ln = std::count( contents_begin, url_start, '\n' ) + 1; + error( library_name, source_path, + "Invalid URL (hardwired file): " + decoded_url, ln ); + } + } + else if(scheme == "mailto" || scheme == "ftp" || scheme == "news" || scheme == "javascript") { + if ( !no_link_errors && is_css(source_path) ) { + ++m_invalid_errors; + int ln = std::count( contents_begin, url_start, '\n' ) + 1; + error( library_name, source_path, + "Invalid protocol for css: " + decoded_url, ln ); + } + } + else { + if(!no_link_errors) { + ++m_invalid_errors; + int ln = std::count( contents_begin, url_start, '\n' ) + 1; + error( library_name, source_path, "Unknown protocol: '" + scheme + "' in url: " + decoded_url, ln ); + } + } + + return; + } + + // Hostname without protocol. + if(authority_matched) { + if(!no_link_errors) { + ++m_invalid_errors; + int ln = std::count( contents_begin, url_start, '\n' ) + 1; + error( library_name, source_path, + "Invalid URL (hostname without protocol): " + decoded_url, ln ); + } + } + + // Check the fragment identifier + if ( fragment_matched ) { + if ( is_css(source_path) ) { + if ( !no_link_errors ) { + ++m_invalid_errors; + int ln = std::count( contents_begin, url_start, '\n' ) + 1; + error( library_name, source_path, + "Fragment link in CSS: " + decoded_url, ln ); + } + } + else { + if ( !no_link_errors && fragment.find( '#' ) != string::npos ) + { + ++m_bookmark_errors; + int ln = std::count( contents_begin, url_start, '\n' ) + 1; + error( library_name, source_path, "Invalid bookmark: " + decoded_url, ln ); + } + else if ( !no_link_errors && url_path.empty() && !fragment.empty() + // w3.org recommends case-sensitive broken bookmark checking + // since some browsers do a case-sensitive match. + && bookmarks.find(decode_percents(fragment)) == bookmarks.end() ) + { + ++m_broken_errors; + int ln = std::count( contents_begin, url_start, '\n' ) + 1; + error( library_name, source_path, "Unknown bookmark: " + decoded_url, ln ); + } + } + + // No more to do if it's just a fragment identifier + if(url_path.empty()) return; + } + + // Detect characters banned by RFC2396: + if ( !no_link_errors && decoded_url.find_first_of( " <>\"{}|\\^[]'" ) != string::npos ) + { + ++m_invalid_errors; + int ln = std::count( contents_begin, url_start, '\n' ) + 1; + error( library_name, source_path, + "Invalid character in URL: " + decoded_url, ln ); + } + + // Check that we actually have a path. + if(url_path.empty()) { + if(!no_link_errors) { + ++m_invalid_errors; + int ln = std::count( contents_begin, url_start, '\n' ) + 1; + error( library_name, source_path, + "Invalid URL (empty path in relative url): " + decoded_url, ln ); + } + } + + // Decode percent encoded characters. + string decoded_path = decode_percents(url_path); + if(decoded_path.empty()) { + if(!no_link_errors) { + ++m_invalid_errors; + int ln = std::count( contents_begin, url_start, '\n' ) + 1; + error( library_name, source_path, + "Invalid URL (invalid character encodings): " + decoded_url, ln ); + } + return; + } + + // strip url of references to current dir + if ( decoded_path[0]=='.' && decoded_path[1]=='/' ) decoded_path.erase( 0, 2 ); + + // url is relative source_path.branch() + // convert to target_path, which is_complete() + path target_path; + try { target_path = source_path.branch_path() /= path( decoded_path ); } + catch ( const fs::filesystem_error & ) + { + if(!no_link_errors) { + int ln = std::count( contents_begin, url_start, '\n' ) + 1; + ++m_invalid_errors; + error( library_name, source_path, + "Invalid URL (error resolving path): " + decoded_url, ln ); + } + return; + } + + // create a m_paths entry if necessary + std::pair< const string, int > entry( + relative_to( target_path, search_root_path() ), 0 ); + m_path_map::iterator itr( m_paths.find( entry.first ) ); + if ( itr == m_paths.end() ) + { + if ( fs::exists( target_path ) ) entry.second = m_present; + itr = m_paths.insert( entry ).first; + } + + // itr now points to the m_paths entry + itr->second |= m_linked_to; + + // if target isn't present, the link is broken + if ( !no_link_errors && (itr->second & m_present) == 0 ) + { + ++m_broken_errors; + int ln = std::count( contents_begin, url_start, '\n' ) + 1; + error( library_name, source_path, "Broken link: " + decoded_url, ln ); + } + } + +// close -------------------------------------------------------------------// + + void link_check::close() + { + for ( m_path_map::const_iterator itr = m_paths.begin(); + itr != m_paths.end(); ++itr ) + { +// std::clog << itr->first << " " << itr->second << "\n"; + if ( (itr->second & m_linked_to) != m_linked_to + && (itr->second & m_nounlinked_errors) != m_nounlinked_errors + && (itr->first.rfind( ".html" ) == itr->first.size()-5 + || itr->first.rfind( ".htm" ) == itr->first.size()-4 + || itr->first.rfind( ".css" ) == itr->first.size()-4) + // because they may be redirectors, it is OK if these are unlinked: + && itr->first.rfind( "index.html" ) == string::npos + && itr->first.rfind( "index.htm" ) == string::npos ) + { + ++m_unlinked_errors; + path full_path( search_root_path() / path(itr->first) ); + error( impute_library( full_path ), full_path, "Unlinked file" ); + } + } + } + + } // namespace inspect +} // namespace boost + |