Monday, 5 January 2015

Multi-Threaded Web Crawler Using Jsoup in Java

I am a beginner and did my best to develop this crawler. If anyone can improve this code, I would be glad to hear about it.

package tes;
import java.io.IOException;
import java.util.HashSet;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;


public class Nam implements Runnable{
   
        private Thread t;
   private String threadName;
  
   Nam( String name){
       threadName = name;
       System.out.println("Creating " +  threadName );
   }
   public void run() {
  String crawlUrl = this.threadName;
   int i=0;
    HashSet<String> anchors = new HashSet<String>();
    try
    {
        Document doc = Jsoup.connect(crawlUrl).get();
        Elements hrefs = doc.select("a");
        for( Element e : hrefs)
        {
            String anchor = e.attr("href").trim();
            anchors.add(anchor);
            System.out.println(anchor);
        }
    }
    catch(IOException ex)
    {
        Logger.getLogger(Tes.class.getName()).log(Level.SEVERE,null,ex);
    }
   
    System.out.println("--------------------");
   
    for( String s:anchors)
    {
         System.out.println(s);
         i++;
    }
       System.out.println("No of Crawled URL::"+i);

}

  
  
   public void start ()
   {
      System.out.println("Starting " +  threadName );
      if (t == null)
      {
         t = new Thread (this, threadName);
         t.start ();
      }
   
}
}


package tes;

public class Tes {
    public static void main(String[] args) {
       
  
      Nam R1 = new Nam( "http://www.yepme.com/");
      R1.start();
     
      Nam R2 = new Nam( "http://www.jabong.com/");
      R2.start();
}
    }