001         package com.croftsoft.apps.spider;
002    
003         import java.io.*;
004         import java.net.*;
005         import java.util.*;
006         import javax.swing.text.MutableAttributeSet;
007         import javax.swing.text.html.HTML;
008         import javax.swing.text.html.HTMLEditorKit;
009         import javax.swing.text.html.parser.DocumentParser;
010         import javax.swing.text.html.parser.DTD;
011    
012         import com.croftsoft.core.lang.NullArgumentException;
013         import com.croftsoft.core.lang.lifecycle.Lifecycle;
014         import com.croftsoft.core.role.Filter;
015         import com.croftsoft.core.util.consumer.Consumer;
016    
017         /*********************************************************************
018         * Web spider.
019         * 
020         * @version
021         *   $Id: Spider.java,v 1.5 2008/09/26 20:27:32 croft Exp $
022         * @since
023         *   2003-04-09
024         * @author
025         *   <a href="https://www.croftsoft.com/">David Wallace Croft</a>
026         *********************************************************************/
027    
028         public final class  Spider
029           implements Lifecycle
030         //////////////////////////////////////////////////////////////////////
031         //////////////////////////////////////////////////////////////////////
032         {
033    
034         private static final String  THREAD_NAME    = "Spider";
035    
036         private static final long    DOWNLOAD_DELAY = 1000;
037    
038         //
039    
040         private final Consumer<URL>  urlConsumer;
041    
042         private final Stack<URL>     stack;
043    
044         private final Set<String>    knownSet;
045    
046         private final HTMLEditorKit.ParserCallback  parserCallback;
047    
048         //
049    
050         private Filter   urlFilter;
051    
052         private Filter   contentTypeFilter;
053    
054         private Thread   thread;
055    
056         private boolean  stopRequested;
057    
058         private URL      currentURL;
059    
060         //////////////////////////////////////////////////////////////////////
061         //////////////////////////////////////////////////////////////////////
062    
063         public static void  main ( String [ ]  args )
064           throws Exception
065         //////////////////////////////////////////////////////////////////////
066         {
067           Spider  spider = new Spider (
068             new Consumer<URL> ( )
069             {
070               public void  consume ( URL  url )
071               {
072                 System.out.println ( url );
073               }
074             } );
075    
076           SpiderUrlFilter  spiderUrlFilter = new SpiderUrlFilter ( spider );
077    
078           spiderUrlFilter.setSameHostOnly ( true );
079    
080           spiderUrlFilter.setSamePortOnly ( true );
081    
082           spider.setUrlFilter ( spiderUrlFilter );
083    
084           spider.setContentTypeFilter (
085             new Filter ( )
086             {
087               public boolean  isFiltrate ( Object  o )
088               {
089                 return ( ( String ) o ).equals ( "text/html" );
090               }
091             } );
092    
093           spider.push ( args [ 0 ] );
094    
095           spider.init ( );
096    
097           spider.start ( );
098         }
099    
100         //////////////////////////////////////////////////////////////////////
101         //////////////////////////////////////////////////////////////////////
102    
103         public  Spider (
104           Consumer<URL>  urlConsumer,
105           Filter         urlFilter,
106           Filter         contentTypeFilter )
107         //////////////////////////////////////////////////////////////////////
108         {
109           NullArgumentException.check ( this.urlConsumer = urlConsumer );
110    
111           setUrlFilter ( urlFilter );
112    
113           setContentTypeFilter ( contentTypeFilter );
114    
115           stack = new Stack<URL> ( );
116    
117           knownSet = new HashSet<String> ( );
118    
119           parserCallback = new HTMLEditorKit.ParserCallback ( )
120             {
121               @Override
122               public void  handleSimpleTag (
123                 HTML.Tag             t,
124                 MutableAttributeSet  a,
125                 int                  pos )
126               {
127                 if ( t == HTML.Tag.A )
128                 {
129                   push ( ( String ) a.getAttribute ( HTML.Attribute.HREF ) );
130                 }
131    
132                 super.handleSimpleTag ( t, a, pos );
133               }
134             };
135         }
136    
137         public  Spider ( Consumer<URL>  urlConsumer )
138         //////////////////////////////////////////////////////////////////////
139         {
140           this ( urlConsumer, null, null );
141         }
142    
143         //////////////////////////////////////////////////////////////////////
144         // accessor methods
145         //////////////////////////////////////////////////////////////////////
146    
147         public URL  getCurrentURL ( ) { return currentURL; }
148    
149         //////////////////////////////////////////////////////////////////////
150         // mutator methods
151         //////////////////////////////////////////////////////////////////////
152    
153         public boolean  push ( String  urlString )
154         //////////////////////////////////////////////////////////////////////
155         {
156           if ( urlString == null )
157           {
158             return false;
159           }
160    
161           URL  newURL = null;
162    
163           try
164           {
165             if ( !urlString.trim ( ).toLowerCase ( ).startsWith ( "http:" ) )
166             {
167               String  externalForm = currentURL.toExternalForm ( );
168    
169               if ( !externalForm.endsWith ( "/" ) )
170               {
171                 externalForm += "/";
172               }
173    
174               newURL = new URL ( new URL ( externalForm ), urlString );
175             }
176             else
177             {
178               newURL = new URL ( urlString );
179             }
180    
181             if ( newURL.getProtocol ( ).equals ( "http" )
182               && ( ( urlFilter == null )
183                 || urlFilter.isFiltrate ( newURL ) ) )
184             {
185               String  externalForm = newURL.toExternalForm ( );
186    
187               if ( externalForm.endsWith ( "/" ) )
188               {
189                 externalForm
190                   = externalForm.substring ( 0, externalForm.length ( ) - 1 );
191    
192                 newURL = new URL ( externalForm );
193               }
194    
195               // trim off the leading "http://"
196    
197               externalForm = externalForm.substring ( 7 );
198    
199               if ( !knownSet.contains ( externalForm ) )
200               {
201                 stack.push ( newURL );
202    
203                 knownSet.add ( externalForm );
204    
205                 return true;
206               }
207             }
208           }
209           catch ( MalformedURLException  ex )
210           {
211             // ignore
212           }
213    
214           return false;
215         }
216    
217         public void  setContentTypeFilter ( Filter  contentTypeFilter )
218         //////////////////////////////////////////////////////////////////////
219         {
220           this.contentTypeFilter = contentTypeFilter;
221         }
222    
223         public void  setUrlFilter ( Filter  urlFilter )
224         //////////////////////////////////////////////////////////////////////
225         {
226           this.urlFilter = urlFilter;
227         }
228    
229         //////////////////////////////////////////////////////////////////////
230         // interface Lifecycle methods
231         //////////////////////////////////////////////////////////////////////
232    
233         public void  init ( )
234         //////////////////////////////////////////////////////////////////////
235         {
236           // empty
237         }
238    
239         public synchronized void  start ( )
240         //////////////////////////////////////////////////////////////////////
241         {
242           stopRequested = false;
243    
244           if ( thread == null )
245           {
246             thread = new Thread (
247               new Runnable ( )
248               {
249                 public void  run ( )
250                 {
251                   loop ( );
252                 }
253               },
254               THREAD_NAME );
255    
256             thread.start ( );
257           }
258           else
259           {
260             notify ( );
261           }
262         }
263    
264         public synchronized void  stop ( )
265         //////////////////////////////////////////////////////////////////////
266         {
267           stopRequested = true;
268    
269           thread.interrupt ( );
270         }
271    
272         public synchronized void  destroy ( )
273         //////////////////////////////////////////////////////////////////////
274         {
275           thread = null;
276    
277           stopRequested = false;
278    
279           notify ( );
280         }
281    
282         //////////////////////////////////////////////////////////////////////
283         // private methods
284         //////////////////////////////////////////////////////////////////////
285    
286         private void  loop ( )
287         //////////////////////////////////////////////////////////////////////
288         {
289           while ( thread != null )
290           {
291             try
292             {
293               spiderNext ( );
294    
295               if ( stopRequested )
296               {
297                 synchronized ( this )
298                 {
299                   while ( stopRequested )
300                   {
301                     wait ( );
302                   }
303                 }
304               }
305             }
306             catch ( InterruptedException  ex )
307             {
308               // ignore
309             }
310           }
311         }
312    
313         private void  spiderNext ( )
314           throws InterruptedException
315         //////////////////////////////////////////////////////////////////////
316         {
317           if ( stack.size ( ) < 1 )
318           {
319             stop ( );
320    
321             destroy ( );
322           }
323    
324           Thread.sleep ( DOWNLOAD_DELAY );
325    
326           currentURL = stack.pop ( );
327    
328           try
329           {
330             HttpURLConnection  httpURLConnection
331               = ( HttpURLConnection ) currentURL.openConnection ( );
332    
333             String  contentType = null;
334    
335             try
336             {
337               contentType = httpURLConnection.getContentType ( );
338    
339               if ( contentType.equals ( "text/html" ) )
340               {
341                 BufferedReader  bufferedReader = new BufferedReader (
342                   new InputStreamReader ( currentURL.openStream ( ) ) );
343    
344                 try
345                 {
346                   DocumentParser  documentParser
347                     = new DocumentParser ( DTD.getDTD ( "html32" ) );
348    
349                   documentParser.parse (
350                     bufferedReader, parserCallback, true );
351                 }
352                 finally
353                 {
354                   bufferedReader.close ( );
355                 }
356               }
357             }
358             finally
359             {
360               httpURLConnection.disconnect ( );
361             }
362    
363             if ( ( contentTypeFilter == null )
364               || ( contentType != null )
365               && contentTypeFilter.isFiltrate ( contentType ) )
366             {
367               Thread.sleep ( DOWNLOAD_DELAY );
368    
369               urlConsumer.consume ( currentURL );
370             }
371           }
372           catch ( Exception  ex )
373           {
374             ex.printStackTrace ( );
375           }
376         }
377    
378         //////////////////////////////////////////////////////////////////////
379         //////////////////////////////////////////////////////////////////////
380         }