summaryrefslogtreecommitdiffstats
path: root/src/boost/tools/inspect/link_check.cpp
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r--src/boost/tools/inspect/link_check.cpp487
1 files changed, 487 insertions, 0 deletions
diff --git a/src/boost/tools/inspect/link_check.cpp b/src/boost/tools/inspect/link_check.cpp
new file mode 100644
index 00000000..182af3bf
--- /dev/null
+++ b/src/boost/tools/inspect/link_check.cpp
@@ -0,0 +1,487 @@
+// link_check implementation -----------------------------------------------//
+
+// Copyright Beman Dawes 2002.
+//
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt)
+
+#include "link_check.hpp"
+#include "boost/regex.hpp"
+#include "boost/filesystem/operations.hpp"
+#include <boost/algorithm/string/case_conv.hpp>
+#include <cstdlib>
+#include <set>
+
+// #include <iostream>
+
+namespace fs = boost::filesystem;
+
+namespace
+{
+ // Matches either an HTML tag carrying a quoted NAME or ID attribute, or an
+ // HTML comment (case-insensitive). Capture groups for the tag alternative:
+ //   1 = tag name, 2 = attribute name (NAME or ID), 3 = opening quote
+ //   character, 4 = attribute value (the bookmark).
+ // The comment alternative has no capture groups, so group 4 is unmatched
+ // when a comment is what was found.
+ boost::regex html_bookmark_regex(
+ "<([^\\s<>]*)\\s*[^<>]*\\s+(NAME|ID)\\s*=\\s*(['\"])(.*?)\\3"
+ "|<!--.*?-->",
+ boost::regbase::normal | boost::regbase::icase);
+ // Matches either an HTML tag carrying a quoted HREF or SRC attribute, or an
+ // HTML comment (case-insensitive). Capture groups for the tag alternative:
+ //   1 = tag name, 2 = opening quote character, 3 = the URL with
+ //   surrounding whitespace excluded.
+ boost::regex html_url_regex(
+ "<([^\\s<>]*)\\s*[^<>]*\\s+(?:HREF|SRC)" // HREF or SRC
+ "\\s*=\\s*(['\"])\\s*(.*?)\\s*\\2"
+ "|<!--.*?-->",
+ boost::regbase::normal | boost::regbase::icase);
+ // Matches either a CSS URL reference or a CSS comment (case-insensitive).
+ // Capture groups for the URL alternative:
+ //   1 = the introducer (`@import` plus a quote, or `url(` plus an optional
+ //   quote), 2 = the URL text up to the next quote or ')'.
+ // There is no group 3 in this expression.
+ boost::regex css_url_regex(
+ "(\\@import\\s*[\"']|url\\s*\\(\\s*[\"']?)([^\"')]*)"
+ "|/\\*.*?\\*/",
+ boost::regbase::normal | boost::regbase::icase);
+
+ // Regular expression for splitting a URL into its components, taken from:
+ // http://tools.ietf.org/html/rfc3986#appendix-B
+ // Capture groups: 2 = scheme, 4 = authority, 5 = path, 7 = query,
+ // 9 = fragment.
+ boost::regex url_decompose_regex(
+ "^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\\?([^#]*))?(#(.*))?$",
+ boost::regbase::normal);
+
+ // Bookmarks (NAME/ID values) defined by the file currently being inspected.
+ // Both sets are cleared and rebuilt for every inspected file.
+ typedef std::set<std::string> bookmark_set;
+ bookmark_set bookmarks;
+ bookmark_set bookmarks_lowercase; // duplicate check needs case insensitive
+
+ // Decode HTML-escaped ampersands: every "&amp;" in url_path is replaced by
+ // a single '&'; any other '&' is copied through unchanged.
+ //
+ // This function has no failure mode. The result is empty only when url_path
+ // itself is empty (do_url reports an empty result as an invalid URL).
+ std::string decode_ampersands(std::string const& url_path) {
+   std::string::size_type pos = 0, next;
+   std::string result;
+   result.reserve(url_path.length());
+
+   while((next = url_path.find('&', pos)) != std::string::npos) {
+     // Copy the text in front of the ampersand, then the ampersand itself.
+     result.append(url_path, pos, next - pos);
+     result += '&';
+
+     // Step over the whole entity for "&amp;", otherwise over just the '&',
+     // and keep scanning so that later entities are decoded as well.
+     pos = next + (url_path.compare(next, 5, "&amp;") == 0 ? 5 : 1);
+   }
+
+   // Copy whatever follows the last ampersand.
+   result.append(url_path, pos, std::string::npos);
+
+   return result;
+ }
+
+ // Numeric value of one hexadecimal digit, or -1 if c is not a hex digit.
+ int hex_digit_value(char c) {
+   if(c >= '0' && c <= '9') return c - '0';
+   if(c >= 'a' && c <= 'f') return c - 'a' + 10;
+   if(c >= 'A' && c <= 'F') return c - 'A' + 10;
+   return -1;
+ }
+
+ // Decode percent encoded characters ("%41" becomes 'A').
+ //
+ // Returns an empty string if there's an error, i.e. when a '%' is not
+ // followed by exactly two hexadecimal digits. Note that an empty url_path
+ // also yields an empty result.
+ std::string decode_percents(std::string const& url_path) {
+   std::string::size_type pos = 0, next;
+   std::string result;
+   result.reserve(url_path.length());
+
+   while((next = url_path.find('%', pos)) != std::string::npos) {
+     // Copy the text in front of the escape.
+     result.append(url_path, pos, next - pos);
+
+     // The escape needs two more characters after the '%'.
+     if(url_path.length() - next < 3) return "";
+
+     // Validate each digit explicitly: a strtol-based conversion would
+     // also accept a sign or leading whitespace (e.g. "%+1" or "% 1").
+     int const high = hex_digit_value(url_path[next + 1]);
+     int const low = hex_digit_value(url_path[next + 2]);
+     if(high < 0 || low < 0) return "";
+
+     result += static_cast<char>(high * 16 + low);
+     pos = next + 3;
+   }
+
+   // Copy whatever follows the last escape.
+   result.append(url_path, pos, std::string::npos);
+
+   return result;
+ }
+
+ // True when the extension of p compares equal to ".css".
+ bool is_css(const path & p) {
+   const bool css_extension = ( p.extension() == ".css" );
+   return css_extension;
+ }
+
+} // unnamed namespace
+
+namespace boost
+{
+ namespace inspect
+ {
+
+// link_check constructor --------------------------------------------------//
+
+ link_check::link_check()
+   : m_broken_errors(0),
+     m_unlinked_errors(0),
+     m_invalid_errors(0),
+     m_bookmark_errors(0),
+     m_duplicate_bookmark_errors(0)
+ {
+   // The base class 'hypertext_inspector' has already registered the HTML
+   // signatures, so only the style sheet signature is added here.
+   register_signature(".css");
+ }
+
+// inspect (all) -----------------------------------------------------------//
+
+ void link_check::inspect(
+   const string & /*library_name*/,
+   const path & full_path )
+ {
+   // Only non-directories are recorded.
+   if ( fs::is_directory( full_path ) )
+     return;
+
+   // Mark the path as present; remembering paths already encountered
+   // reduces disk activity later on.
+   m_paths[ relative_to( full_path, search_root_path() ) ] |= m_present;
+ }
+
+// inspect ( .htm, .html, .shtml, .css ) -----------------------------------//
+
+ // Inspects one .htm/.html/.shtml/.css file: records the bookmarks it
+ // defines and checks every URL it references.
+ //
+ //   library_name - forwarded to error() and do_url() for reporting
+ //   full_path    - the file being inspected
+ //   contents     - the text of that file
+ //
+ // A file containing the marker "boostinspect:nounlinked" is flagged so that
+ // close() does not report it as unlinked. A file containing
+ // "boostinspect:nolink" has its link errors suppressed.
+ void link_check::inspect(
+   const string & library_name,
+   const path & full_path, // example: c:/foo/boost/filesystem/path.hpp
+   const string & contents ) // contents of file to be inspected
+ {
+   // The markers are spelled as two adjacent literals, presumably so that
+   // this source file does not itself contain the marker text.
+   if (contents.find( "boostinspect:" "nounlinked" ) != string::npos)
+     m_paths[ relative_to( full_path, search_root_path() ) ] |= m_nounlinked_errors;
+
+   bool no_link_errors =
+     (contents.find( "boostinspect:" "nolink" ) != string::npos);
+
+   // build bookmarks databases
+   bookmarks.clear();
+   bookmarks_lowercase.clear();
+   string::const_iterator a_start( contents.begin() );
+   string::const_iterator a_end( contents.end() );
+   boost::match_results< string::const_iterator > a_what;
+   boost::match_flag_type a_flags = boost::match_default;
+
+   if(!is_css(full_path))
+   {
+     // Value of the most recent ID attribute; used to tolerate a NAME
+     // attribute that immediately repeats the same value.
+     string previous_id;
+
+     while( boost::regex_search( a_start, a_end, a_what, html_bookmark_regex, a_flags) )
+     {
+       // a_what[0] contains the whole string iterators.
+       // a_what[1] contains the tag iterators.
+       // a_what[2] contains the attribute name.
+       // a_what[3] contains the quote character.
+       // a_what[4] contains the bookmark iterators.
+
+       // Group 4 is unmatched when the match was an HTML comment.
+       if (a_what[4].matched)
+       {
+         string tag( a_what[1].first, a_what[1].second );
+         boost::algorithm::to_lower(tag);
+         string attribute( a_what[2].first, a_what[2].second );
+         boost::algorithm::to_lower(attribute);
+         string bookmark( a_what[4].first, a_what[4].second );
+
+         bool name_following_id = ( attribute == "name" && previous_id == bookmark );
+         if ( tag != "meta" && attribute == "id" ) previous_id = bookmark;
+         else previous_id.clear();
+
+         if ( tag != "meta" && !name_following_id )
+         {
+           bookmarks.insert( bookmark );
+
+           // w3.org recommends case-insensitive checking for duplicate bookmarks
+           // since some browsers do a case-insensitive match.
+           string bookmark_lowercase( bookmark );
+           boost::algorithm::to_lower(bookmark_lowercase);
+
+           std::pair<bookmark_set::iterator, bool> result
+             = bookmarks_lowercase.insert( bookmark_lowercase );
+           if (!result.second)
+           {
+             ++m_duplicate_bookmark_errors;
+             int ln = std::count( contents.begin(), a_what[3].first, '\n' ) + 1;
+             error( library_name, full_path, "Duplicate bookmark: " + bookmark, ln );
+           }
+         }
+       }
+
+       a_start = a_what[0].second; // update search position
+       a_flags |= boost::match_prev_avail; // update flags
+       a_flags |= boost::match_not_bob;
+     }
+   }
+
+   // process urls
+   string::const_iterator start( contents.begin() );
+   string::const_iterator end( contents.end() );
+   boost::match_results< string::const_iterator > what;
+   boost::match_flag_type flags = boost::match_default;
+
+   if(!is_css(full_path))
+   {
+     while( boost::regex_search( start, end, what, html_url_regex, flags) )
+     {
+       // what[0] contains the whole string iterators.
+       // what[1] contains the element type iterators.
+       // what[3] contains the URL iterators.
+
+       // Group 3 is unmatched when the match was an HTML comment.
+       if(what[3].matched)
+       {
+         string type( what[1].first, what[1].second );
+         boost::algorithm::to_lower(type);
+
+         // Only <a> and <link> elements may refer to external content.
+         // TODO: Complain if 'link' tags use external stylesheets.
+         do_url( string( what[3].first, what[3].second ),
+           library_name, full_path, no_link_errors,
+           type == "a" || type == "link", contents.begin(), what[3].first );
+       }
+
+       start = what[0].second; // update search position
+       flags |= boost::match_prev_avail; // update flags
+       flags |= boost::match_not_bob;
+     }
+   }
+
+   // NOTE(review): for non-CSS files this scan resumes from wherever the
+   // HTML scan above stopped (start and flags are shared), not from the
+   // beginning of the file - confirm that this is intended.
+   while( boost::regex_search( start, end, what, css_url_regex, flags) )
+   {
+     // what[0] contains the whole string iterators.
+     // what[2] contains the URL iterators.
+
+     // Group 2 is unmatched when the match was a CSS comment.
+     if(what[2].matched)
+     {
+       // css_url_regex has only two capture groups, so the position of the
+       // URL (used for the reported line number) is what[2].first.
+       do_url( string( what[2].first, what[2].second ),
+         library_name, full_path, no_link_errors, false,
+         contents.begin(), what[2].first );
+     }
+
+     start = what[0].second; // update search position
+     flags |= boost::match_prev_avail; // update flags
+     flags |= boost::match_not_bob;
+   }
+ }
+
+// do_url ------------------------------------------------------------------//
+
+ // Validates a single URL found in source_path and records the file it
+ // points at.
+ //
+ //   url                    - the URL text exactly as it appears in the file
+ //   library_name           - forwarded to error() for reporting
+ //   source_path            - the file that contains the URL
+ //   no_link_errors         - when true, no errors are reported or counted
+ //   allow_external_content - when false, a URL with a scheme or an
+ //                            authority is reported as external content
+ //   contents_begin         - start of the file's text
+ //   url_start              - position of the URL within that text; the
+ //                            two iterators are only used to compute the
+ //                            line number attached to an error
+ //
+ // For a relative URL that resolves to a path, the m_paths entry of the
+ // target is marked as linked-to, and a broken link is reported if the
+ // target is not present.
+ void link_check::do_url( const string & url, const string & library_name,
+   const path & source_path, bool no_link_errors, bool allow_external_content,
+   std::string::const_iterator contents_begin, std::string::const_iterator url_start )
+ // precondition: source_path.is_complete()
+ {
+   if(!no_link_errors && url.empty()) {
+     ++m_invalid_errors;
+     int ln = std::count( contents_begin, url_start, '\n' ) + 1;
+     error( library_name, source_path, "Empty URL.", ln );
+     return;
+   }
+
+   // Decode ampersand encoded characters (not applicable to CSS).
+   string decoded_url = is_css(source_path) ? url : decode_ampersands(url);
+   if(decoded_url.empty()) {
+     if(!no_link_errors) {
+       ++m_invalid_errors;
+       int ln = std::count( contents_begin, url_start, '\n' ) + 1;
+       error( library_name, source_path,
+         "Invalid URL (invalid ampersand encodings): " + url, ln );
+     }
+     return;
+   }
+
+   // Split the URL into scheme, authority, path, query and fragment.
+   boost::smatch m;
+   if(!boost::regex_match(decoded_url, m, url_decompose_regex)) {
+     if(!no_link_errors) {
+       ++m_invalid_errors;
+       int ln = std::count( contents_begin, url_start, '\n' ) + 1;
+       error( library_name, source_path, "Invalid URL: " + decoded_url, ln );
+     }
+     return;
+   }
+
+   bool scheme_matched = m[2].matched,
+     authority_matched = m[4].matched,
+     //query_matched = m[7].matched,
+     fragment_matched = m[9].matched;
+
+   std::string scheme(m[2]),
+     authority(m[4]),
+     url_path(m[5]),
+     //query(m[7]),
+     fragment(m[9]);
+
+   // Check for external content. This is reported but does not stop the
+   // remaining checks.
+   if(!allow_external_content && (authority_matched || scheme_matched)) {
+     if(!no_link_errors) {
+       ++m_invalid_errors;
+       int ln = std::count( contents_begin, url_start, '\n' ) + 1;
+       error( library_name, source_path, "External content: " + decoded_url, ln );
+     }
+   }
+
+   // Protocol checks. A URL with a scheme is never resolved against the
+   // local file system, so every branch ends by returning.
+   if(scheme_matched) {
+     if(scheme == "http" || scheme == "https") {
+       // All http links should have a hostname. Generally if they don't
+       // it's by mistake. If they shouldn't, then a protocol isn't
+       // required.
+       if(!authority_matched) {
+         if(!no_link_errors) {
+           ++m_invalid_errors;
+           int ln = std::count( contents_begin, url_start, '\n' ) + 1;
+           error( library_name, source_path, "No hostname: " + decoded_url, ln );
+         }
+       }
+
+       return;
+     }
+     else if(scheme == "file") {
+       if(!no_link_errors) {
+         ++m_invalid_errors;
+         int ln = std::count( contents_begin, url_start, '\n' ) + 1;
+         error( library_name, source_path,
+           "Invalid URL (hardwired file): " + decoded_url, ln );
+       }
+     }
+     else if(scheme == "mailto" || scheme == "ftp" || scheme == "news" || scheme == "javascript") {
+       // These schemes are accepted in hypertext but not in style sheets.
+       if ( !no_link_errors && is_css(source_path) ) {
+         ++m_invalid_errors;
+         int ln = std::count( contents_begin, url_start, '\n' ) + 1;
+         error( library_name, source_path,
+           "Invalid protocol for css: " + decoded_url, ln );
+       }
+     }
+     else {
+       if(!no_link_errors) {
+         ++m_invalid_errors;
+         int ln = std::count( contents_begin, url_start, '\n' ) + 1;
+         error( library_name, source_path, "Unknown protocol: '" + scheme + "' in url: " + decoded_url, ln );
+       }
+     }
+
+     return;
+   }
+
+   // Hostname without protocol.
+   if(authority_matched) {
+     if(!no_link_errors) {
+       ++m_invalid_errors;
+       int ln = std::count( contents_begin, url_start, '\n' ) + 1;
+       error( library_name, source_path,
+         "Invalid URL (hostname without protocol): " + decoded_url, ln );
+     }
+   }
+
+   // Check the fragment identifier
+   if ( fragment_matched ) {
+     if ( is_css(source_path) ) {
+       if ( !no_link_errors ) {
+         ++m_invalid_errors;
+         int ln = std::count( contents_begin, url_start, '\n' ) + 1;
+         error( library_name, source_path,
+           "Fragment link in CSS: " + decoded_url, ln );
+       }
+     }
+     else {
+       if ( !no_link_errors && fragment.find( '#' ) != string::npos )
+       {
+         ++m_bookmark_errors;
+         int ln = std::count( contents_begin, url_start, '\n' ) + 1;
+         error( library_name, source_path, "Invalid bookmark: " + decoded_url, ln );
+       }
+       // Only a fragment that refers to the current file (empty path) can
+       // be checked against the bookmarks collected for this file.
+       else if ( !no_link_errors && url_path.empty() && !fragment.empty()
+         // w3.org recommends case-sensitive broken bookmark checking
+         // since some browsers do a case-sensitive match.
+         && bookmarks.find(decode_percents(fragment)) == bookmarks.end() )
+       {
+         ++m_broken_errors;
+         int ln = std::count( contents_begin, url_start, '\n' ) + 1;
+         error( library_name, source_path, "Unknown bookmark: " + decoded_url, ln );
+       }
+     }
+
+     // No more to do if it's just a fragment identifier
+     if(url_path.empty()) return;
+   }
+
+   // Detect characters banned by RFC2396:
+   if ( !no_link_errors && decoded_url.find_first_of( " <>\"{}|\\^[]'" ) != string::npos )
+   {
+     ++m_invalid_errors;
+     int ln = std::count( contents_begin, url_start, '\n' ) + 1;
+     error( library_name, source_path,
+       "Invalid character in URL: " + decoded_url, ln );
+   }
+
+   // Check that we actually have a path.
+   if(url_path.empty()) {
+     if(!no_link_errors) {
+       ++m_invalid_errors;
+       int ln = std::count( contents_begin, url_start, '\n' ) + 1;
+       error( library_name, source_path,
+         "Invalid URL (empty path in relative url): " + decoded_url, ln );
+     }
+     // Stop here: decoding an empty path also yields an empty string,
+     // which would otherwise be reported a second time below as an
+     // encoding error.
+     return;
+   }
+
+   // Decode percent encoded characters.
+   string decoded_path = decode_percents(url_path);
+   if(decoded_path.empty()) {
+     if(!no_link_errors) {
+       ++m_invalid_errors;
+       int ln = std::count( contents_begin, url_start, '\n' ) + 1;
+       error( library_name, source_path,
+         "Invalid URL (invalid character encodings): " + decoded_url, ln );
+     }
+     return;
+   }
+
+   // strip url of references to current dir
+   // (compare() is safe even when decoded_path is a single character)
+   if ( decoded_path.compare( 0, 2, "./" ) == 0 ) decoded_path.erase( 0, 2 );
+
+   // url is relative source_path.branch()
+   // convert to target_path, which is_complete()
+   path target_path;
+   try { target_path = source_path.branch_path() /= path( decoded_path ); }
+   catch ( const fs::filesystem_error & )
+   {
+     if(!no_link_errors) {
+       int ln = std::count( contents_begin, url_start, '\n' ) + 1;
+       ++m_invalid_errors;
+       error( library_name, source_path,
+         "Invalid URL (error resolving path): " + decoded_url, ln );
+     }
+     return;
+   }
+
+   // create a m_paths entry if necessary
+   std::pair< const string, int > entry(
+     relative_to( target_path, search_root_path() ), 0 );
+   m_path_map::iterator itr( m_paths.find( entry.first ) );
+   if ( itr == m_paths.end() )
+   {
+     // The file system is only consulted for targets not seen before.
+     if ( fs::exists( target_path ) ) entry.second = m_present;
+     itr = m_paths.insert( entry ).first;
+   }
+
+   // itr now points to the m_paths entry
+   itr->second |= m_linked_to;
+
+   // if target isn't present, the link is broken
+   if ( !no_link_errors && (itr->second & m_present) == 0 )
+   {
+     ++m_broken_errors;
+     int ln = std::count( contents_begin, url_start, '\n' ) + 1;
+     error( library_name, source_path, "Broken link: " + decoded_url, ln );
+   }
+ }
+
+// close -------------------------------------------------------------------//
+
+ // Reports every recorded .html/.htm/.css file that no inspected link
+ // pointed at, unless the file opted out with the "nounlinked" marker or
+ // is an index page.
+ void link_check::close()
+ {
+   for ( m_path_map::const_iterator itr = m_paths.begin();
+     itr != m_paths.end(); ++itr )
+   {
+     const string & name = itr->first;
+
+     // Suffix tests with an explicit length check. Comparing rfind()
+     // against size()-N is wrong for names one character shorter than
+     // the suffix, because the unsigned subtraction wraps to npos and
+     // then equals the "not found" result.
+     const bool is_html = name.size() >= 5
+       && name.compare( name.size() - 5, 5, ".html" ) == 0;
+     const bool is_htm = name.size() >= 4
+       && name.compare( name.size() - 4, 4, ".htm" ) == 0;
+     const bool is_css_file = name.size() >= 4
+       && name.compare( name.size() - 4, 4, ".css" ) == 0;
+
+     if ( (itr->second & m_linked_to) != m_linked_to
+       && (itr->second & m_nounlinked_errors) != m_nounlinked_errors
+       && (is_html || is_htm || is_css_file)
+       // because they may be redirectors, it is OK if these are unlinked:
+       && name.rfind( "index.html" ) == string::npos
+       && name.rfind( "index.htm" ) == string::npos )
+     {
+       ++m_unlinked_errors;
+       path full_path( search_root_path() / path(name) );
+       error( impute_library( full_path ), full_path, "Unlinked file" );
+     }
+   }
+ }
+
+ } // namespace inspect
+} // namespace boost
+