HTML 解析/提取器:woody

jopen 10年前

woody 是一款 Java 的 HTML 解析/提取器,用法非常类似 webmagic,是对其抽取模板的完全重写;之所以单独提取出来,是为了更好地重用。

一些新功能:

  • 多种结果数据类型(String, char, byte, short, int, long, double, float, String[], Set, List, Date)
  • 支持用户自定义脚本处理函数(目前支持 Javascript 函数配置处理)
  • 支持css,xpath内核替换
  • 支持filter功能
  • 对css,xpath 内核对象的缓存

一个完整的例子:

public class OsChinaBlog {     public static void main(String[] args) throws Exception {    Document doc = Jsoup.connect("http://www.oschina.net/news/43879/webmagic-0-3-0").timeout(60000)      .userAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:23.0) Gecko/20100101 Firefox/23.0").get();    String html = doc.html();    OsChinaBlogModel model = AnnotationExtractor.me().process(html, OsChinaBlogModel.class);    System.out.println(model.toJson());   }     public static class OsChinaBlogModel extends Model {      public OsChinaBlogModel() {     //use to reflect    }      @Inject    @ComboExtract(value = { @ExtractBy(value = "h1.OSCTitle", type = ExprType.CSS),      @ExtractBy(value = "//title/text()", type = ExprType.XPATH) }, op = OP.OR)    public String title;      @Inject    @ExtractBy(value = "div.PubDate a[href~=http://my\\.oschina\\.net/]", type = ExprType.CSS)    public String author;      @Inject    @ExtractBy(value = "发布于.\\s*(\\d+年\\d+月\\d+日)", type = ExprType.REGEX)    public Date publishDate;      @Inject    @ComboExtract(value = {      @ExtractBy(value = "div.PubDate", type = ExprType.CSS, setting = @Setting(outerHtml = true)),      @ExtractBy(value = "(\\d+)评", type = ExprType.REGEX) }, op = OP.AND)    public int commentNum;      @Inject    @ExtractBy(value = "span#p_favor_count", type = ExprType.CSS, setting = @Setting(function = @Function(value = "replace", args = {      "+", "" })))    public int collectNum;      @Inject    @ComboExtract(value = {      @ExtractBy(value = "div[id=userComments]", type = ExprType.CSS, setting = @Setting(outerHtml = true)),      @ExtractBy(value = "div.TextContent", type = ExprType.CSS) }, op = OP.AND, multi = true)    public List commentContents;      @Inject    @ExtractBy(value = "div[id=toolbar_wrapper]", setting = @Setting(fliters = { "b", "span" }), type = ExprType.CSS, impl = Document.class)    public String weibo;     }  }

项目主页:http://www.open-open.com/lib/view/home/1378731525709