001 package com.croftsoft.apps.spider; 002 003 import java.io.*; 004 import java.net.*; 005 import java.util.*; 006 import javax.swing.text.MutableAttributeSet; 007 import javax.swing.text.html.HTML; 008 import javax.swing.text.html.HTMLEditorKit; 009 import javax.swing.text.html.parser.DocumentParser; 010 import javax.swing.text.html.parser.DTD; 011 012 import com.croftsoft.core.lang.NullArgumentException; 013 import com.croftsoft.core.lang.lifecycle.Lifecycle; 014 import com.croftsoft.core.role.Filter; 015 import com.croftsoft.core.util.consumer.Consumer; 016 017 /********************************************************************* 018 * Web spider. 019 * 020 * @version 021 * $Id: Spider.java,v 1.5 2008/09/26 20:27:32 croft Exp $ 022 * @since 023 * 2003-04-09 024 * @author 025 * <a href="https://www.croftsoft.com/">David Wallace Croft</a> 026 *********************************************************************/ 027 028 public final class Spider 029 implements Lifecycle 030 ////////////////////////////////////////////////////////////////////// 031 ////////////////////////////////////////////////////////////////////// 032 { 033 034 private static final String THREAD_NAME = "Spider"; 035 036 private static final long DOWNLOAD_DELAY = 1000; 037 038 // 039 040 private final Consumer<URL> urlConsumer; 041 042 private final Stack<URL> stack; 043 044 private final Set<String> knownSet; 045 046 private final HTMLEditorKit.ParserCallback parserCallback; 047 048 // 049 050 private Filter urlFilter; 051 052 private Filter contentTypeFilter; 053 054 private Thread thread; 055 056 private boolean stopRequested; 057 058 private URL currentURL; 059 060 ////////////////////////////////////////////////////////////////////// 061 ////////////////////////////////////////////////////////////////////// 062 063 public static void main ( String [ ] args ) 064 throws Exception 065 ////////////////////////////////////////////////////////////////////// 066 { 067 Spider spider = new Spider ( 068 new Consumer<URL> ( ) 069 { 070 public void consume ( URL url ) 071 { 072 System.out.println ( url ); 073 } 074 } ); 075 076 SpiderUrlFilter spiderUrlFilter = new SpiderUrlFilter ( spider ); 077 078 spiderUrlFilter.setSameHostOnly ( true ); 079 080 spiderUrlFilter.setSamePortOnly ( true ); 081 082 spider.setUrlFilter ( spiderUrlFilter ); 083 084 spider.setContentTypeFilter ( 085 new Filter ( ) 086 { 087 public boolean isFiltrate ( Object o ) 088 { 089 return ( ( String ) o ).equals ( "text/html" ); 090 } 091 } ); 092 093 spider.push ( args [ 0 ] ); 094 095 spider.init ( ); 096 097 spider.start ( ); 098 } 099 100 ////////////////////////////////////////////////////////////////////// 101 ////////////////////////////////////////////////////////////////////// 102 103 public Spider ( 104 Consumer<URL> urlConsumer, 105 Filter urlFilter, 106 Filter contentTypeFilter ) 107 ////////////////////////////////////////////////////////////////////// 108 { 109 NullArgumentException.check ( this.urlConsumer = urlConsumer ); 110 111 setUrlFilter ( urlFilter ); 112 113 setContentTypeFilter ( contentTypeFilter ); 114 115 stack = new Stack<URL> ( ); 116 117 knownSet = new HashSet<String> ( ); 118 119 parserCallback = new HTMLEditorKit.ParserCallback ( ) 120 { 121 @Override 122 public void handleSimpleTag ( 123 HTML.Tag t, 124 MutableAttributeSet a, 125 int pos ) 126 { 127 if ( t == HTML.Tag.A ) 128 { 129 push ( ( String ) a.getAttribute ( HTML.Attribute.HREF ) ); 130 } 131 132 super.handleSimpleTag ( t, a, pos ); 133 } 134 }; 135 } 136 137 public Spider ( Consumer<URL> urlConsumer ) 138 ////////////////////////////////////////////////////////////////////// 139 { 140 this ( urlConsumer, null, null ); 141 } 142 143 ////////////////////////////////////////////////////////////////////// 144 // accessor methods 145 ////////////////////////////////////////////////////////////////////// 146 147 public URL getCurrentURL ( ) { return currentURL; } 148 149 ////////////////////////////////////////////////////////////////////// 150 // mutator methods 151 ////////////////////////////////////////////////////////////////////// 152 153 public boolean push ( String urlString ) 154 ////////////////////////////////////////////////////////////////////// 155 { 156 if ( urlString == null ) 157 { 158 return false; 159 } 160 161 URL newURL = null; 162 163 try 164 { 165 if ( !urlString.trim ( ).toLowerCase ( ).startsWith ( "http:" ) ) 166 { 167 String externalForm = currentURL.toExternalForm ( ); 168 169 if ( !externalForm.endsWith ( "/" ) ) 170 { 171 externalForm += "/"; 172 } 173 174 newURL = new URL ( new URL ( externalForm ), urlString ); 175 } 176 else 177 { 178 newURL = new URL ( urlString ); 179 } 180 181 if ( newURL.getProtocol ( ).equals ( "http" ) 182 && ( ( urlFilter == null ) 183 || urlFilter.isFiltrate ( newURL ) ) ) 184 { 185 String externalForm = newURL.toExternalForm ( ); 186 187 if ( externalForm.endsWith ( "/" ) ) 188 { 189 externalForm 190 = externalForm.substring ( 0, externalForm.length ( ) - 1 ); 191 192 newURL = new URL ( externalForm ); 193 } 194 195 // trim off the leading "http://" 196 197 externalForm = externalForm.substring ( 7 ); 198 199 if ( !knownSet.contains ( externalForm ) ) 200 { 201 stack.push ( newURL ); 202 203 knownSet.add ( externalForm ); 204 205 return true; 206 } 207 } 208 } 209 catch ( MalformedURLException ex ) 210 { 211 // ignore 212 } 213 214 return false; 215 } 216 217 public void setContentTypeFilter ( Filter contentTypeFilter ) 218 ////////////////////////////////////////////////////////////////////// 219 { 220 this.contentTypeFilter = contentTypeFilter; 221 } 222 223 public void setUrlFilter ( Filter urlFilter ) 224 ////////////////////////////////////////////////////////////////////// 225 { 226 this.urlFilter = urlFilter; 227 } 228 229 ////////////////////////////////////////////////////////////////////// 230 // interface Lifecycle methods 231 ////////////////////////////////////////////////////////////////////// 232 233 public void init ( ) 234 ////////////////////////////////////////////////////////////////////// 235 { 236 // empty 237 } 238 239 public synchronized void start ( ) 240 ////////////////////////////////////////////////////////////////////// 241 { 242 stopRequested = false; 243 244 if ( thread == null ) 245 { 246 thread = new Thread ( 247 new Runnable ( ) 248 { 249 public void run ( ) 250 { 251 loop ( ); 252 } 253 }, 254 THREAD_NAME ); 255 256 thread.start ( ); 257 } 258 else 259 { 260 notify ( ); 261 } 262 } 263 264 public synchronized void stop ( ) 265 ////////////////////////////////////////////////////////////////////// 266 { 267 stopRequested = true; 268 269 thread.interrupt ( ); 270 } 271 272 public synchronized void destroy ( ) 273 ////////////////////////////////////////////////////////////////////// 274 { 275 thread = null; 276 277 stopRequested = false; 278 279 notify ( ); 280 } 281 282 ////////////////////////////////////////////////////////////////////// 283 // private methods 284 ////////////////////////////////////////////////////////////////////// 285 286 private void loop ( ) 287 ////////////////////////////////////////////////////////////////////// 288 { 289 while ( thread != null ) 290 { 291 try 292 { 293 spiderNext ( ); 294 295 if ( stopRequested ) 296 { 297 synchronized ( this ) 298 { 299 while ( stopRequested ) 300 { 301 wait ( ); 302 } 303 } 304 } 305 } 306 catch ( InterruptedException ex ) 307 { 308 // ignore 309 } 310 } 311 } 312 313 private void spiderNext ( ) 314 throws InterruptedException 315 ////////////////////////////////////////////////////////////////////// 316 { 317 if ( stack.size ( ) < 1 ) 318 { 319 stop ( ); 320 321 destroy ( ); 322 } 323 324 Thread.sleep ( DOWNLOAD_DELAY ); 325 326 currentURL = stack.pop ( ); 327 328 try 329 { 330 HttpURLConnection httpURLConnection 331 = ( HttpURLConnection ) currentURL.openConnection ( ); 332 333 String contentType = null; 334 335 try 336 { 337 contentType = httpURLConnection.getContentType ( ); 338 339 if ( contentType.equals ( "text/html" ) ) 340 { 341 BufferedReader bufferedReader = new BufferedReader ( 342 new InputStreamReader ( currentURL.openStream ( ) ) ); 343 344 try 345 { 346 DocumentParser documentParser 347 = new DocumentParser ( DTD.getDTD ( "html32" ) ); 348 349 documentParser.parse ( 350 bufferedReader, parserCallback, true ); 351 } 352 finally 353 { 354 bufferedReader.close ( ); 355 } 356 } 357 } 358 finally 359 { 360 httpURLConnection.disconnect ( ); 361 } 362 363 if ( ( contentTypeFilter == null ) 364 || ( contentType != null ) 365 && contentTypeFilter.isFiltrate ( contentType ) ) 366 { 367 Thread.sleep ( DOWNLOAD_DELAY ); 368 369 urlConsumer.consume ( currentURL ); 370 } 371 } 372 catch ( Exception ex ) 373 { 374 ex.printStackTrace ( ); 375 } 376 } 377 378 ////////////////////////////////////////////////////////////////////// 379 ////////////////////////////////////////////////////////////////////// 380 }