001 package com.croftsoft.apps.spider;
002
003 import java.io.*;
004 import java.net.*;
005 import java.util.*;
006 import javax.swing.text.MutableAttributeSet;
007 import javax.swing.text.html.HTML;
008 import javax.swing.text.html.HTMLEditorKit;
009 import javax.swing.text.html.parser.DocumentParser;
010 import javax.swing.text.html.parser.DTD;
011
012 import com.croftsoft.core.lang.NullArgumentException;
013 import com.croftsoft.core.lang.lifecycle.Lifecycle;
014 import com.croftsoft.core.role.Filter;
015 import com.croftsoft.core.util.consumer.Consumer;
016
017 /*********************************************************************
018 * Web spider.
019 *
020 * @version
021 * $Id: Spider.java,v 1.5 2008/09/26 20:27:32 croft Exp $
022 * @since
023 * 2003-04-09
024 * @author
025 * <a href="https://www.croftsoft.com/">David Wallace Croft</a>
026 *********************************************************************/
027
028 public final class Spider
029 implements Lifecycle
030 //////////////////////////////////////////////////////////////////////
031 //////////////////////////////////////////////////////////////////////
032 {
033
034 private static final String THREAD_NAME = "Spider";
035
036 private static final long DOWNLOAD_DELAY = 1000;
037
038 //
039
040 private final Consumer<URL> urlConsumer;
041
042 private final Stack<URL> stack;
043
044 private final Set<String> knownSet;
045
046 private final HTMLEditorKit.ParserCallback parserCallback;
047
048 //
049
050 private Filter urlFilter;
051
052 private Filter contentTypeFilter;
053
054 private Thread thread;
055
056 private boolean stopRequested;
057
058 private URL currentURL;
059
060 //////////////////////////////////////////////////////////////////////
061 //////////////////////////////////////////////////////////////////////
062
063 public static void main ( String [ ] args )
064 throws Exception
065 //////////////////////////////////////////////////////////////////////
066 {
067 Spider spider = new Spider (
068 new Consumer<URL> ( )
069 {
070 public void consume ( URL url )
071 {
072 System.out.println ( url );
073 }
074 } );
075
076 SpiderUrlFilter spiderUrlFilter = new SpiderUrlFilter ( spider );
077
078 spiderUrlFilter.setSameHostOnly ( true );
079
080 spiderUrlFilter.setSamePortOnly ( true );
081
082 spider.setUrlFilter ( spiderUrlFilter );
083
084 spider.setContentTypeFilter (
085 new Filter ( )
086 {
087 public boolean isFiltrate ( Object o )
088 {
089 return ( ( String ) o ).equals ( "text/html" );
090 }
091 } );
092
093 spider.push ( args [ 0 ] );
094
095 spider.init ( );
096
097 spider.start ( );
098 }
099
100 //////////////////////////////////////////////////////////////////////
101 //////////////////////////////////////////////////////////////////////
102
103 public Spider (
104 Consumer<URL> urlConsumer,
105 Filter urlFilter,
106 Filter contentTypeFilter )
107 //////////////////////////////////////////////////////////////////////
108 {
109 NullArgumentException.check ( this.urlConsumer = urlConsumer );
110
111 setUrlFilter ( urlFilter );
112
113 setContentTypeFilter ( contentTypeFilter );
114
115 stack = new Stack<URL> ( );
116
117 knownSet = new HashSet<String> ( );
118
119 parserCallback = new HTMLEditorKit.ParserCallback ( )
120 {
121 @Override
122 public void handleSimpleTag (
123 HTML.Tag t,
124 MutableAttributeSet a,
125 int pos )
126 {
127 if ( t == HTML.Tag.A )
128 {
129 push ( ( String ) a.getAttribute ( HTML.Attribute.HREF ) );
130 }
131
132 super.handleSimpleTag ( t, a, pos );
133 }
134 };
135 }
136
137 public Spider ( Consumer<URL> urlConsumer )
138 //////////////////////////////////////////////////////////////////////
139 {
140 this ( urlConsumer, null, null );
141 }
142
143 //////////////////////////////////////////////////////////////////////
144 // accessor methods
145 //////////////////////////////////////////////////////////////////////
146
147 public URL getCurrentURL ( ) { return currentURL; }
148
149 //////////////////////////////////////////////////////////////////////
150 // mutator methods
151 //////////////////////////////////////////////////////////////////////
152
153 public boolean push ( String urlString )
154 //////////////////////////////////////////////////////////////////////
155 {
156 if ( urlString == null )
157 {
158 return false;
159 }
160
161 URL newURL = null;
162
163 try
164 {
165 if ( !urlString.trim ( ).toLowerCase ( ).startsWith ( "http:" ) )
166 {
167 String externalForm = currentURL.toExternalForm ( );
168
169 if ( !externalForm.endsWith ( "/" ) )
170 {
171 externalForm += "/";
172 }
173
174 newURL = new URL ( new URL ( externalForm ), urlString );
175 }
176 else
177 {
178 newURL = new URL ( urlString );
179 }
180
181 if ( newURL.getProtocol ( ).equals ( "http" )
182 && ( ( urlFilter == null )
183 || urlFilter.isFiltrate ( newURL ) ) )
184 {
185 String externalForm = newURL.toExternalForm ( );
186
187 if ( externalForm.endsWith ( "/" ) )
188 {
189 externalForm
190 = externalForm.substring ( 0, externalForm.length ( ) - 1 );
191
192 newURL = new URL ( externalForm );
193 }
194
195 // trim off the leading "http://"
196
197 externalForm = externalForm.substring ( 7 );
198
199 if ( !knownSet.contains ( externalForm ) )
200 {
201 stack.push ( newURL );
202
203 knownSet.add ( externalForm );
204
205 return true;
206 }
207 }
208 }
209 catch ( MalformedURLException ex )
210 {
211 // ignore
212 }
213
214 return false;
215 }
216
217 public void setContentTypeFilter ( Filter contentTypeFilter )
218 //////////////////////////////////////////////////////////////////////
219 {
220 this.contentTypeFilter = contentTypeFilter;
221 }
222
223 public void setUrlFilter ( Filter urlFilter )
224 //////////////////////////////////////////////////////////////////////
225 {
226 this.urlFilter = urlFilter;
227 }
228
229 //////////////////////////////////////////////////////////////////////
230 // interface Lifecycle methods
231 //////////////////////////////////////////////////////////////////////
232
233 public void init ( )
234 //////////////////////////////////////////////////////////////////////
235 {
236 // empty
237 }
238
239 public synchronized void start ( )
240 //////////////////////////////////////////////////////////////////////
241 {
242 stopRequested = false;
243
244 if ( thread == null )
245 {
246 thread = new Thread (
247 new Runnable ( )
248 {
249 public void run ( )
250 {
251 loop ( );
252 }
253 },
254 THREAD_NAME );
255
256 thread.start ( );
257 }
258 else
259 {
260 notify ( );
261 }
262 }
263
264 public synchronized void stop ( )
265 //////////////////////////////////////////////////////////////////////
266 {
267 stopRequested = true;
268
269 thread.interrupt ( );
270 }
271
272 public synchronized void destroy ( )
273 //////////////////////////////////////////////////////////////////////
274 {
275 thread = null;
276
277 stopRequested = false;
278
279 notify ( );
280 }
281
282 //////////////////////////////////////////////////////////////////////
283 // private methods
284 //////////////////////////////////////////////////////////////////////
285
286 private void loop ( )
287 //////////////////////////////////////////////////////////////////////
288 {
289 while ( thread != null )
290 {
291 try
292 {
293 spiderNext ( );
294
295 if ( stopRequested )
296 {
297 synchronized ( this )
298 {
299 while ( stopRequested )
300 {
301 wait ( );
302 }
303 }
304 }
305 }
306 catch ( InterruptedException ex )
307 {
308 // ignore
309 }
310 }
311 }
312
313 private void spiderNext ( )
314 throws InterruptedException
315 //////////////////////////////////////////////////////////////////////
316 {
317 if ( stack.size ( ) < 1 )
318 {
319 stop ( );
320
321 destroy ( );
322 }
323
324 Thread.sleep ( DOWNLOAD_DELAY );
325
326 currentURL = stack.pop ( );
327
328 try
329 {
330 HttpURLConnection httpURLConnection
331 = ( HttpURLConnection ) currentURL.openConnection ( );
332
333 String contentType = null;
334
335 try
336 {
337 contentType = httpURLConnection.getContentType ( );
338
339 if ( contentType.equals ( "text/html" ) )
340 {
341 BufferedReader bufferedReader = new BufferedReader (
342 new InputStreamReader ( currentURL.openStream ( ) ) );
343
344 try
345 {
346 DocumentParser documentParser
347 = new DocumentParser ( DTD.getDTD ( "html32" ) );
348
349 documentParser.parse (
350 bufferedReader, parserCallback, true );
351 }
352 finally
353 {
354 bufferedReader.close ( );
355 }
356 }
357 }
358 finally
359 {
360 httpURLConnection.disconnect ( );
361 }
362
363 if ( ( contentTypeFilter == null )
364 || ( contentType != null )
365 && contentTypeFilter.isFiltrate ( contentType ) )
366 {
367 Thread.sleep ( DOWNLOAD_DELAY );
368
369 urlConsumer.consume ( currentURL );
370 }
371 }
372 catch ( Exception ex )
373 {
374 ex.printStackTrace ( );
375 }
376 }
377
378 //////////////////////////////////////////////////////////////////////
379 //////////////////////////////////////////////////////////////////////
380 }