theWebcrawler


Before you begin, review the following topics:

and finally, the design I roughly followed:
{ref: http://andreas-hess.info/programming/webcrawler/}



I decided to use the Strategy pattern from the start, as I knew I would be looking for better and more efficient ways to parse a web page for href links.

Below is a Jsoup-based strategy that implements the interface "UrlParserStrategy".
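The interface itself is not listed in this post; judging from how JsoupExecuter implements it below, it is essentially a single-method contract (a minimal sketch, the exact declaration may differ in the repository):

package com.kant.web.crawler.strategy;

import java.util.List;

/**
 * Strategy contract: given a page URL, return the list of absolute
 * href links found on that page (or null if the page could not be parsed).
 */
public interface UrlParserStrategy {
 List<String> fetchLinks(String url);
}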


/**
 * 
 */
package com.kant.web.crawler.strategy;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * @author shashi
 * 
 */
public class JsoupExecuter implements UrlParserStrategy {

 /*
  * (non-Javadoc)
  * 
  * @see
  * com.kant.web.crawler.strategy.UrlParserStrategy#fetchLinks(java.lang.String)
  */
 public List<String> fetchLinks(String url) {
  if (url == null || url.isEmpty())
   return null;
  Document doc = null;
  List<String> urls = null;
  try {
   // if (isValid(url)) {
   doc = Jsoup.connect(url).timeout(3000).get();
   urls = new ArrayList<String>();
   Elements links = doc.select("a[href]");
   for (Element link : links) {
    urls.add(link.attr("abs:href"));
   }
   // }
  } catch (IOException e) {
   System.out
     .println("[Skipping]:Non-textual webcontent or timeout has occured "
       + url);
   urls = null;
  }
  doc = null;
  return urls;
 }

 /**
  * for valid media types
  */
 // private boolean isValid(String webUrl) throws IOException {
 // URL url = null;
 // HttpURLConnection connection = null;
 // try {
 // url = new URL(webUrl);
 // connection = (HttpURLConnection) url.openConnection();
 // connection.setRequestMethod("HEAD");
 // connection.connect();
 // String contentType = connection.getContentType();
 // return contentType.contains("text/html;")
 // || contentType.contains("application/xml");
 //
 // } catch (MalformedURLException e) {
 // e.printStackTrace();
 // } finally {
 // connection.disconnect();
 // }
 //
 // return false;
 // }
}


Jsoup parses the web page, finds all anchor elements with an href attribute, stores each link's absolute URL in a list, and returns it.
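Used on its own, the strategy looks like this (a small sketch; FetchLinksDemo and the example URL are just placeholders, and the jsoup jar must be on the classpath):

import java.util.List;

import com.kant.web.crawler.strategy.JsoupExecuter;
import com.kant.web.crawler.strategy.UrlParserStrategy;

public class FetchLinksDemo {
 public static void main(String[] args) {
  UrlParserStrategy parser = new JsoupExecuter();
  // Returns the absolute URLs of all links on the page, or null on failure.
  List<String> links = parser.fetchLinks("http://example.com/");
  if (links != null) {
   for (String link : links) {
    System.out.println(link);
   }
  }
 }
}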

Then comes the JobController, which contains the main logic:

  • A BlockingQueue is used as the shared work queue (a thread-safe Java implementation).
  • An ExecutorService is used to create and maintain the worker thread pool.
  • Each job (a QueueMessage) carries the URL to fetch the next level of links from, together with the current search depth (a sketch of this class follows the list).
  • A HashSet is used to filter out duplicate jobs and avoid crawl loops.
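QueueMessage and JobFactory are not listed in this post; going by how JobController and JobRunnable use them, they boil down to roughly the following (a sketch, the actual classes are in the repository):

package com.kant.web.crawler.model;

/**
 * A crawl job: the URL to fetch links from and the depth at which it sits.
 * (Sketch only.)
 */
public class QueueMessage {
 private final String url;
 private final int level;

 public QueueMessage(String url, int level) {
  this.url = url;
  this.level = level;
 }

 public String getUrl() {
  return url;
 }

 public int getLevel() {
  return level;
 }
}

// In a separate file, same package (sketch only):
public class JobFactory {
 public static QueueMessage createJob(String url, int level) {
  return new QueueMessage(url, level);
 }
}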




/**
 * 
 */
package com.kant.web.crawler.controller;

import java.util.HashSet;
import java.util.Set;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.TimeUnit;

import com.kant.web.crawler.model.JobFactory;
import com.kant.web.crawler.model.JobRunnable;
import com.kant.web.crawler.model.QueueMessage;
import com.kant.web.crawler.strategy.UrlParserStrategy;

/**
 * @author shashi
 * 
 */
public class JobController implements CrawlerController {
 private ExecutorService executorService;
 private BlockingQueue<QueueMessage> sharedQueue;
 private UrlParserStrategy parser;
 private final int endLevel = 5; // TODO make it configurable
 private Set<String> urlsCache;

 /**
  * @param executorService
  * @param sharedQueue
  * @param parser
  */
 public JobController(ExecutorService executorService,
   BlockingQueue<QueueMessage> sharedQueue, UrlParserStrategy parser) {
  this.executorService = executorService;
  this.sharedQueue = sharedQueue;
  this.parser = parser;
  urlsCache = new HashSet<String>();

 }

 /**
  * Processes at most 3000 messages from the shared queue.
  * 
  * TODO make processing power configurable
  * 
  * @param baseUrl
  */
 public void processWebSite(String baseUrl) {
  try {
   sharedQueue.put(JobFactory.createJob(baseUrl, 1));
  } catch (InterruptedException e) {
   e.printStackTrace();
  }

  for (int i = 0; i < 3000; i++) {
   QueueMessage message;
   try {
    message = sharedQueue.poll(400, TimeUnit.MILLISECONDS);
   } catch (InterruptedException e) {
    message = null;
   }
   if (message != null) {
    if (message.getLevel() < endLevel) {
     if (!urlsCache.contains(message.getUrl())) {
      urlsCache.add(message.getUrl());
      executorService.execute(new JobRunnable(message,
        parser, sharedQueue));
     }
    } else if (message.getLevel() == endLevel) {
     System.out.println("[Processed/completed]: "
       + message.getUrl() + " [At depth]: " + endLevel);
    }

   }
   System.out.println("\n-------------" + i + "-------------\n");
  }
 }
}
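The bootstrap code is not shown above; wiring the pieces together could look roughly like this (a sketch with an assumed class name, pool size, and queue capacity; the actual entry point is in the repository linked at the end):

package com.kant.web.crawler;

import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;

import com.kant.web.crawler.controller.JobController;
import com.kant.web.crawler.model.QueueMessage;
import com.kant.web.crawler.strategy.JsoupExecuter;

public class CrawlerMain {
 public static void main(String[] args) {
  // Bounded queue so workers block briefly instead of exhausting memory.
  BlockingQueue<QueueMessage> sharedQueue = new ArrayBlockingQueue<QueueMessage>(10000);
  // Fixed pool of worker threads; the size here is arbitrary.
  ExecutorService executorService = Executors.newFixedThreadPool(10);

  JobController controller = new JobController(executorService, sharedQueue,
    new JsoupExecuter());
  // Blocks until the controller's 3000-iteration polling loop finishes.
  controller.processWebSite("http://example.com/");

  // Stop accepting new work; in-flight jobs are allowed to complete.
  executorService.shutdown();
 }
}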



And finally, my worker thread job.

It uses the strategy to fetch URLs and then adds them to the shared queue for further crawling by the JobController.



/**
 * 
 */
package com.kant.web.crawler.model;

import java.util.List;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.TimeUnit;

import com.kant.web.crawler.strategy.UrlParserStrategy;

/**
 * @author shashi
 * 
 */
public class JobRunnable implements Runnable {
 private UrlParserStrategy parser;
 private QueueMessage job;
 private BlockingQueue<QueueMessage> sharedQ;

 public JobRunnable(QueueMessage job, UrlParserStrategy parser,
   BlockingQueue<QueueMessage> queueRef) {
  this.parser = parser;
  this.job = job;
  this.sharedQ = queueRef;
 }

 /**
  * Fetches the links found at this job's URL and offers each one back to
  * the shared queue at the next depth level; a link is dropped if the
  * queue stays full for 100 ms.
  */
 public void run() {
  String url = job.getUrl();
  if (url != null && !url.isEmpty()) {
   System.out.println("[Processing]: " + url + " [At depth]: "
     + job.getLevel());
   List<String> result = parser.fetchLinks(url);
   int newLevel = job.getLevel() + 1;
   if (result != null) {
    for (String item : result) {
     if (item != null && !item.isEmpty()) {
      try {
       // sharedQ.put(JobFactory.createJob(item,
       // newLevel));
       sharedQ.offer(JobFactory.createJob(item, newLevel),
         100, TimeUnit.MILLISECONDS);
      } catch (InterruptedException e) {
       e.printStackTrace();
      }
     }
    }
   }
   System.out.println("[Completed]: " + url);
  }
 }
}



Download the source code here:
https://github.com/thekant/theWebCrawler

Note: I used http://hilite.me/ for highlighting the code (almost effortless to use).

FYI: http://java-source.net/open-source/crawlers
