theWebcrawler


Before you begin, review the following topics:

and finally, the design I roughly followed:
{ref: http://andreas-hess.info/programming/webcrawler/}



I decided to use the Strategy pattern from the start, as I knew I would be looking for better and more efficient ways to parse a web page for href links.

Below is a Jsoup-based strategy that implements the interface "UrlParserStrategy".
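The interface itself is not listed in this post; judging from how JsoupExecuter implements it below, it is essentially a single-method contract (a minimal sketch, the exact declaration may differ in the repository):

package com.kant.web.crawler.strategy;

import java.util.List;

/**
 * Strategy contract: given a page URL, return the list of absolute
 * href links found on that page (or null if the page could not be parsed).
 */
public interface UrlParserStrategy {
 List<String> fetchLinks(String url);
}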


/**
 * 
 */
package com.kant.web.crawler.strategy;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * @author shashi
 * 
 */
public class JsoupExecuter implements UrlParserStrategy {

 /*
  * (non-Javadoc)
  * 
  * @see
  * com.kant.web.crawler.strategy.UrlParserStrategy#fetchLinks(java.lang.String)
  */
 public List<String> fetchLinks(String url) {
  if (url == null || url.isEmpty())
   return null;
  Document doc = null;
  List<String> urls = null;
  try {
   // if (isValid(url)) {
   doc = Jsoup.connect(url).timeout(3000).get();
   urls = new ArrayList<String>();
   Elements links = doc.select("a[href]");
   for (Element link : links) {
    urls.add(link.attr("abs:href"));
   }
   // }
  } catch (IOException e) {
   System.out
     .println("[Skipping]:Non-textual webcontent or timeout has occured "
       + url);
   urls = null;
  }
  doc = null;
  return urls;
 }

 /**
  * for valid media types
  */
 // private boolean isValid(String webUrl) throws IOException {
 // URL url = null;
 // HttpURLConnection connection = null;
 // try {
 // url = new URL(webUrl);
 // connection = (HttpURLConnection) url.openConnection();
 // connection.setRequestMethod("HEAD");
 // connection.connect();
 // String contentType = connection.getContentType();
 // return contentType.contains("text/html;")
 // || contentType.contains("application/xml");
 //
 // } catch (MalformedURLException e) {
 // e.printStackTrace();
 // } finally {
 // connection.disconnect();
 // }
 //
 // return false;
 // }
}


Jsoup parses the web page, finds all anchor elements with an href attribute, stores each link's absolute URL in a list, and returns it.
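Used on its own, the strategy looks like this (a small sketch; FetchLinksDemo and the example URL are just placeholders, and the jsoup jar must be on the classpath):

import java.util.List;

import com.kant.web.crawler.strategy.JsoupExecuter;
import com.kant.web.crawler.strategy.UrlParserStrategy;

public class FetchLinksDemo {
 public static void main(String[] args) {
  UrlParserStrategy parser = new JsoupExecuter();
  // Returns the absolute URLs of all links on the page, or null on failure.
  List<String> links = parser.fetchLinks("http://example.com/");
  if (links != null) {
   for (String link : links) {
    System.out.println(link);
   }
  }
 }
}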

Then comes the JobController, which contains the main logic:

  • A BlockingQueue is used as the shared work queue (a thread-safe Java implementation).
  • An ExecutorService is used to create and maintain the worker thread pool.
  • Each job (a QueueMessage) carries the URL to fetch the next level of links from, together with the current search depth (a sketch of this class follows the list).
  • A HashSet is used to filter out duplicate jobs and avoid crawl loops.
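QueueMessage and JobFactory are not listed in this post; going by how JobController and JobRunnable use them, they boil down to roughly the following (a sketch, the actual classes are in the repository):

package com.kant.web.crawler.model;

/**
 * A crawl job: the URL to fetch links from and the depth at which it sits.
 * (Sketch only.)
 */
public class QueueMessage {
 private final String url;
 private final int level;

 public QueueMessage(String url, int level) {
  this.url = url;
  this.level = level;
 }

 public String getUrl() {
  return url;
 }

 public int getLevel() {
  return level;
 }
}

// In a separate file, same package (sketch only):
public class JobFactory {
 public static QueueMessage createJob(String url, int level) {
  return new QueueMessage(url, level);
 }
}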




/**
 * 
 */
package com.kant.web.crawler.controller;

import java.util.HashSet;
import java.util.Set;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.TimeUnit;

import com.kant.web.crawler.model.JobFactory;
import com.kant.web.crawler.model.JobRunnable;
import com.kant.web.crawler.model.QueueMessage;
import com.kant.web.crawler.strategy.UrlParserStrategy;

/**
 * @author shashi
 * 
 */
public class JobController implements CrawlerController {
 private ExecutorService executorService;
 private BlockingQueue<QueueMessage> sharedQueue;
 private UrlParserStrategy parser;
 private final int endLevel = 5; // TODO make it configurable
 private Set<String> urlsCache;

 /**
  * @param executorService
  * @param sharedQueue
  * @param parser
  */
 public JobController(ExecutorService executorService,
   BlockingQueue<QueueMessage> sharedQueue, UrlParserStrategy parser) {
  this.executorService = executorService;
  this.sharedQueue = sharedQueue;
  this.parser = parser;
  urlsCache = new HashSet<String>();

 }

 /**
  * Processes at most 3000 messages from the shared queue.
  * 
  * TODO make processing power configurable
  * 
  * @param baseUrl
  */
 public void processWebSite(String baseUrl) {
  try {
   sharedQueue.put(JobFactory.createJob(baseUrl, 1));
  } catch (InterruptedException e) {
   e.printStackTrace();
  }

  for (int i = 0; i < 3000; i++) {
   QueueMessage message;
   try {
    message = sharedQueue.poll(400, TimeUnit.MILLISECONDS);
   } catch (InterruptedException e) {
    message = null;
   }
   if (message != null) {
    if (message.getLevel() < endLevel) {
     if (!urlsCache.contains(message.getUrl())) {
      urlsCache.add(message.getUrl());
      executorService.execute(new JobRunnable(message,
        parser, sharedQueue));
     }
    } else if (message.getLevel() == endLevel) {
     System.out.println("[Processed/completed]: "
       + message.getUrl() + " [At depth]: " + endLevel);
    }

   }
   System.out.println("\n-------------" + i + "-------------\n");
  }
 }
}
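The bootstrap code is not shown above; wiring the pieces together could look roughly like this (a sketch with an assumed class name, pool size, and queue capacity; the actual entry point is in the repository linked at the end):

package com.kant.web.crawler;

import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;

import com.kant.web.crawler.controller.JobController;
import com.kant.web.crawler.model.QueueMessage;
import com.kant.web.crawler.strategy.JsoupExecuter;

public class CrawlerMain {
 public static void main(String[] args) {
  // Bounded queue so workers block briefly instead of exhausting memory.
  BlockingQueue<QueueMessage> sharedQueue = new ArrayBlockingQueue<QueueMessage>(10000);
  // Fixed pool of worker threads; the size here is arbitrary.
  ExecutorService executorService = Executors.newFixedThreadPool(10);

  JobController controller = new JobController(executorService, sharedQueue,
    new JsoupExecuter());
  // Blocks until the controller's 3000-iteration polling loop finishes.
  controller.processWebSite("http://example.com/");

  // Stop accepting new work; in-flight jobs are allowed to complete.
  executorService.shutdown();
 }
}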



And finally, my worker thread job.

It uses the strategy to fetch URLs and then adds them to the shared queue for further crawling by the JobController.



/**
 * 
 */
package com.kant.web.crawler.model;

import java.util.List;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.TimeUnit;

import com.kant.web.crawler.strategy.UrlParserStrategy;

/**
 * @author shashi
 * 
 */
public class JobRunnable implements Runnable {
 private UrlParserStrategy parser;
 private QueueMessage job;
 private BlockingQueue<QueueMessage> sharedQ;

 public JobRunnable(QueueMessage job, UrlParserStrategy parser,
   BlockingQueue<QueueMessage> queueRef) {
  this.parser = parser;
  this.job = job;
  this.sharedQ = queueRef;
 }

 /**
  * Fetches the links found at this job's URL and offers each one back to
  * the shared queue at the next depth level; a link is dropped if the
  * queue stays full for 100 ms.
  */
 public void run() {
  String url = job.getUrl();
  if (url != null && !url.isEmpty()) {
   System.out.println("[Processing]: " + url + " [At depth]: "
     + job.getLevel());
   List<String> result = parser.fetchLinks(url);
   int newLevel = job.getLevel() + 1;
   if (result != null) {
    for (String item : result) {
     if (item != null && !item.isEmpty()) {
      try {
       // sharedQ.put(JobFactory.createJob(item,
       // newLevel));
       sharedQ.offer(JobFactory.createJob(item, newLevel),
         100, TimeUnit.MILLISECONDS);
      } catch (InterruptedException e) {
       e.printStackTrace();
      }
     }
    }
   }
   System.out.println("[Completed]: " + url);
  }
 }
}



Download the source code here:
https://github.com/thekant/theWebCrawler

Note: I used http://hilite.me/ for highlighting the code (almost effortless to use).

FYI: http://java-source.net/open-source/crawlers
