Java多线程Web爬虫 Crawler4j

jopen 11年前

Java多线程Web爬虫 Crawler4j
Crawler4j是一个开源的Java Web爬虫，提供一个用于抓取Web页面的简单接口。您可以在5分钟内建立一个多线程的网络爬虫！

示例代码:

import java.util.ArrayList;
import java.util.Locale;
import java.util.regex.Pattern;

import edu.uci.ics.crawler4j.crawler.Page;
import edu.uci.ics.crawler4j.crawler.WebCrawler;
import edu.uci.ics.crawler4j.url.WebURL;

/**
 * Sample crawler that only follows links under {@code http://www.ics.uci.edu/}
 * and skips URLs ending in a known static/binary file extension.
 */
public class MyCrawler extends WebCrawler {

    /**
     * Matches URLs whose path ends in an extension this crawler does not want
     * (stylesheets, scripts, images, audio/video, archives, PDF).
     * Compiled once and shared, since {@link Pattern} instances are immutable.
     */
    private static final Pattern FILTERS = Pattern.compile(".*(\\.(css|js|bmp|gif|jpe?g"
            + "|png|tiff?|mid|mp2|mp3|mp4"
            + "|wav|avi|mov|mpeg|ram|m4v|pdf"
            + "|rm|smil|wmv|swf|wma|zip|rar|gz))$");

    /** Only URLs starting with this prefix are accepted by {@link #shouldVisit(WebURL)}. */
    private static final String ALLOWED_PREFIX = "http://www.ics.uci.edu/";

    public MyCrawler() {
    }

    /**
     * Decides whether the given URL should be crawled.
     *
     * @param url the candidate URL
     * @return {@code true} if the lower-cased URL does not end in a filtered
     *         extension and starts with {@code http://www.ics.uci.edu/};
     *         {@code false} otherwise
     */
    public boolean shouldVisit(WebURL url) {
        // Locale.ROOT keeps the lowercasing independent of the JVM's default locale;
        // e.g. under a Turkish locale "GIF" would become "gıf" and bypass the filter.
        String href = url.getURL().toLowerCase(Locale.ROOT);
        return !FILTERS.matcher(href).matches() && href.startsWith(ALLOWED_PREFIX);
    }

    /**
     * Handles a fetched page. This sample only reads the page's document id, URL,
     * text and outgoing links; real processing is left to the implementer.
     *
     * @param page the fetched page
     */
    public void visit(Page page) {
        int docid = page.getWebURL().getDocid();
        String url = page.getWebURL().getURL();
        String text = page.getText();
        ArrayList<WebURL> links = page.getURLs();
        // Placeholder: process docid, url, text and links here.
    }
}

项目主页:http://www.open-open.com/lib/view/home/1350054122400