Planet
navi: home | PPS | about | screenshots | download | development | forum

source: downloads/boost_1_33_1/tools/inspect/link_check.cpp @ 25

Last change on this file since 25 was 12, checked in by landauf, 18 years ago

added boost

File size: 5.6 KB
RevLine 
[12]1//  link_check implementation  -----------------------------------------------//
2
3//  Copyright Beman Dawes 2002.
4//  Distributed under the Boost Software License, Version 1.0.
5//  (See accompanying file LICENSE_1_0.txt or copy at
6//  http://www.boost.org/LICENSE_1_0.txt)
7
8#include "link_check.hpp"
9#include <boost/regex.hpp>
10#include <boost/filesystem/operations.hpp>
11#include <boost/filesystem/exception.hpp>
12
13namespace fs = boost::filesystem;
14
15namespace
16{
17  boost::regex url_regex(
18    "<\\s*[^>]*\\s+(?:HREF|SRC)" // HREF or SRC
19    "\\s*=\\s*\"([^\"]*)\"",
20    boost::regbase::normal | boost::regbase::icase);
21
22} // unnamed namespace
23
24namespace boost
25{
26  namespace inspect
27  {
28
29//  link_check constructor  --------------------------------------------------//
30   
31   link_check::link_check()
32     : m_broken_errors(0), m_unlinked_errors(0), m_invalid_errors(0),
33       m_bookmark_errors(0)
34   {
35   }
36     
37//  inspect (all)  -----------------------------------------------------------//
38
39   void link_check::inspect(
40      const string & library_name,
41      const path & full_path )
42    {
43      // keep track of paths already encountered to reduce disk activity
44      if ( !fs::is_directory( full_path ) )
45        m_paths[ relative_to( full_path, fs::initial_path() ) ] |= m_present;
46    }
47
48//  inspect ( .htm, .html )  -------------------------------------------------//
49
50   void link_check::inspect(
51      const string & library_name,
52      const path & full_path,   // example: c:/foo/boost/filesystem/path.hpp
53      const string & contents )     // contents of file to be inspected
54    {
55      string::const_iterator start( contents.begin() );
56      string::const_iterator end( contents.end() );
57      boost::match_results< string::const_iterator > what; 
58      boost::match_flag_type flags = boost::match_default; 
59
60      while( boost::regex_search( start, end, what, url_regex, flags) ) 
61      { 
62        // what[0] contains the whole string iterators.
63        // what[1] contains the URL iterators.
64        do_url( string( what[1].first, what[1].second ),
65          library_name, full_path );
66
67        start = what[0].second; // update search position
68        flags |= boost::match_prev_avail; // update flags
69        flags |= boost::match_not_bob; 
70      } 
71    }
72
73//  do_url  ------------------------------------------------------------------//
74
75    void link_check::do_url( const string & url, const string & library_name,
76      const path & source_path ) // precondition: source_path.is_complete()
77    {
78      if ( url[0] == '#'
79        || url.find( "mailto:" ) == 0
80        || url.find( "http:" ) == 0
81        || url.find( "https:" ) == 0
82        || url.find( "ftp:" ) == 0
83        || url.find( "news:" ) == 0
84        || url.find( "javascript:" ) == 0
85        ) return;
86
87      if ( url.find( "file:" ) == 0 )
88      {
89        ++m_invalid_errors;
90        error( library_name, source_path, "invalid URL (hardwired file): " + url );
91        return;
92      }
93
94      // detect characters banned by RFC2396:
95      if ( url.find_first_of( " <>\"{}|\\^[]'" ) != string::npos )
96      {
97        ++m_invalid_errors;
98        error( library_name, source_path, "invalid character in URL: " + url );
99      }
100     
101      // strip url of bookmarks
102      string plain_url( url );
103      string::size_type pos( plain_url.find( '#' ) );
104      if ( pos != string::npos )
105      {
106        plain_url.erase( pos );
107        // detect characters banned by RFC2396 in bookmark:
108        if ( url.find( '#', pos+1 ) != string::npos )
109        {
110          ++m_bookmark_errors;
111          error( library_name, source_path, "invalid bookmark: " + url );
112        }
113      }
114
115      // strip url of references to current dir
116      if ( plain_url[0]=='.' && plain_url[1]=='/' ) plain_url.erase( 0, 2 );
117
118      // url is relative source_path.branch()
119      // convert to target_path, which is_complete()
120      path target_path;
121      try { target_path = source_path.branch_path() /= path( plain_url, fs::no_check ); }
122      catch ( const fs::filesystem_error & )
123      {
124        ++m_invalid_errors;
125        error( library_name, source_path, "invalid URL: " + url );
126        return;
127      }
128
129      // create a m_paths entry if necessary
130      std::pair< const string, int > entry(
131        relative_to( target_path, fs::initial_path() ), 0 );
132      m_path_map::iterator itr( m_paths.find( entry.first ) );
133      if ( itr == m_paths.end() )
134      {
135        if ( fs::exists( target_path ) ) entry.second = m_present;
136        itr = m_paths.insert( entry ).first;
137      }
138
139      // itr now points to the m_paths entry
140      itr->second |= m_linked_to;
141
142      // if target isn't present, the link is broken
143      if ( (itr->second & m_present) == 0 )
144      {
145        ++m_broken_errors;
146        error( library_name, source_path, "broken link: " + url );
147      }
148    }
149
150//  close  -------------------------------------------------------------------//
151
152   void link_check::close()
153   {
154     for ( m_path_map::const_iterator itr = m_paths.begin();
155       itr != m_paths.end(); ++itr )
156     {
157// std::clog << itr->first << " " << itr->second << "\n";
158       if ( (itr->second & m_linked_to) != m_linked_to
159         && (itr->first.rfind( ".html" ) == itr->first.size()-5
160          || itr->first.rfind( ".htm" ) == itr->first.size()-4)
161         // because they may be redirectors, it is OK if these are unlinked:
162         && itr->first.rfind( "index.html" ) == string::npos
163         && itr->first.rfind( "index.htm" ) == string::npos )
164       {
165         ++m_unlinked_errors;
166         path full_path( fs::initial_path() / path(itr->first, fs::no_check) );
167         error( impute_library( full_path ), full_path, "unlinked file" );
168       }
169     }
170   }
171
172  } // namespace inspect
173} // namespace boost
174
Note: See TracBrowser for help on using the repository browser.