Planet

navi

home

PPS

about

screenshots

download

development

forum

Context Navigation

source: downloads/boost_1_33_1/tools/inspect/link_check.cpp @ 25

Last change to this file (as of revision 25) was revision 12, checked in by landauf, 18 years ago
added boost
File size: 5.6 KB

Rev	Line
[12]	1	// link_check implementation -----------------------------------------------//
	2
	3	// Copyright Beman Dawes 2002.
	4	// Distributed under the Boost Software License, Version 1.0.
	5	// (See accompanying file LICENSE_1_0.txt or copy at
	6	// http://www.boost.org/LICENSE_1_0.txt)
	7
	8	#include "link_check.hpp"
	9	#include <boost/regex.hpp>
	10	#include <boost/filesystem/operations.hpp>
	11	#include <boost/filesystem/exception.hpp>
	12
	13	namespace fs = boost::filesystem;
	14
	15	namespace
	16	{
	17	boost::regex url_regex(
	18	"<\\s[^>]\\s+(?:HREF\|SRC)" // HREF or SRC
	19	"\\s=\\s\"([^\"]*)\"",
	20	boost::regbase::normal \| boost::regbase::icase);
	21
	22	} // unnamed namespace
	23
	24	namespace boost
	25	{
	26	namespace inspect
	27	{
	28
	29	// link_check constructor --------------------------------------------------//
	30
	31	link_check::link_check()
	32	: m_broken_errors(0), m_unlinked_errors(0), m_invalid_errors(0),
	33	m_bookmark_errors(0)
	34	{
	35	}
	36
	37	// inspect (all) -----------------------------------------------------------//
	38
	39	void link_check::inspect(
	40	const string & library_name,
	41	const path & full_path )
	42	{
	43	// keep track of paths already encountered to reduce disk activity
	44	if ( !fs::is_directory( full_path ) )
	45	m_paths[ relative_to( full_path, fs::initial_path() ) ] \|= m_present;
	46	}
	47
	48	// inspect ( .htm, .html ) -------------------------------------------------//
	49
	50	void link_check::inspect(
	51	const string & library_name,
	52	const path & full_path, // example: c:/foo/boost/filesystem/path.hpp
	53	const string & contents ) // contents of file to be inspected
	54	{
	55	string::const_iterator start( contents.begin() );
	56	string::const_iterator end( contents.end() );
	57	boost::match_results< string::const_iterator > what;
	58	boost::match_flag_type flags = boost::match_default;
	59
	60	while( boost::regex_search( start, end, what, url_regex, flags) )
	61	{
	62	// what[0] contains the whole string iterators.
	63	// what[1] contains the URL iterators.
	64	do_url( string( what[1].first, what[1].second ),
	65	library_name, full_path );
	66
	67	start = what[0].second; // update search position
	68	flags \|= boost::match_prev_avail; // update flags
	69	flags \|= boost::match_not_bob;
	70	}
	71	}
	72
	73	// do_url ------------------------------------------------------------------//
	74
	75	void link_check::do_url( const string & url, const string & library_name,
	76	const path & source_path ) // precondition: source_path.is_complete()
	77	{
	78	if ( url[0] == '#'
	79	\|\| url.find( "mailto:" ) == 0
	80	\|\| url.find( "http:" ) == 0
	81	\|\| url.find( "https:" ) == 0
	82	\|\| url.find( "ftp:" ) == 0
	83	\|\| url.find( "news:" ) == 0
	84	\|\| url.find( "javascript:" ) == 0
	85	) return;
	86
	87	if ( url.find( "file:" ) == 0 )
	88	{
	89	++m_invalid_errors;
	90	error( library_name, source_path, "invalid URL (hardwired file): " + url );
	91	return;
	92	}
	93
	94	// detect characters banned by RFC2396:
	95	if ( url.find_first_of( " <>\"{}\|\\^[]'" ) != string::npos )
	96	{
	97	++m_invalid_errors;
	98	error( library_name, source_path, "invalid character in URL: " + url );
	99	}
	100
	101	// strip url of bookmarks
	102	string plain_url( url );
	103	string::size_type pos( plain_url.find( '#' ) );
	104	if ( pos != string::npos )
	105	{
	106	plain_url.erase( pos );
	107	// detect characters banned by RFC2396 in bookmark:
	108	if ( url.find( '#', pos+1 ) != string::npos )
	109	{
	110	++m_bookmark_errors;
	111	error( library_name, source_path, "invalid bookmark: " + url );
	112	}
	113	}
	114
	115	// strip url of references to current dir
	116	if ( plain_url[0]=='.' && plain_url[1]=='/' ) plain_url.erase( 0, 2 );
	117
	118	// url is relative source_path.branch()
	119	// convert to target_path, which is_complete()
	120	path target_path;
	121	try { target_path = source_path.branch_path() /= path( plain_url, fs::no_check ); }
	122	catch ( const fs::filesystem_error & )
	123	{
	124	++m_invalid_errors;
	125	error( library_name, source_path, "invalid URL: " + url );
	126	return;
	127	}
	128
	129	// create a m_paths entry if necessary
	130	std::pair< const string, int > entry(
	131	relative_to( target_path, fs::initial_path() ), 0 );
	132	m_path_map::iterator itr( m_paths.find( entry.first ) );
	133	if ( itr == m_paths.end() )
	134	{
	135	if ( fs::exists( target_path ) ) entry.second = m_present;
	136	itr = m_paths.insert( entry ).first;
	137	}
	138
	139	// itr now points to the m_paths entry
	140	itr->second \|= m_linked_to;
	141
	142	// if target isn't present, the link is broken
	143	if ( (itr->second & m_present) == 0 )
	144	{
	145	++m_broken_errors;
	146	error( library_name, source_path, "broken link: " + url );
	147	}
	148	}
	149
	150	// close -------------------------------------------------------------------//
	151
	152	void link_check::close()
	153	{
	154	for ( m_path_map::const_iterator itr = m_paths.begin();
	155	itr != m_paths.end(); ++itr )
	156	{
	157	// std::clog << itr->first << " " << itr->second << "\n";
	158	if ( (itr->second & m_linked_to) != m_linked_to
	159	&& (itr->first.rfind( ".html" ) == itr->first.size()-5
	160	\|\| itr->first.rfind( ".htm" ) == itr->first.size()-4)
	161	// because they may be redirectors, it is OK if these are unlinked:
	162	&& itr->first.rfind( "index.html" ) == string::npos
	163	&& itr->first.rfind( "index.htm" ) == string::npos )
	164	{
	165	++m_unlinked_errors;
	166	path full_path( fs::initial_path() / path(itr->first, fs::no_check) );
	167	error( impute_library( full_path ), full_path, "unlinked file" );
	168	}
	169	}
	170	}
	171
	172	} // namespace inspect
	173	} // namespace boost
	174

Note: See TracBrowser for help on using the repository browser.

Download in other formats: