Java操作Html文档利器---Jsoup

jopen 11年前

一、Jsoup是虾米东西?这个不多解释,给你列个场景你就知道了:比如你现在想通过Java代码从www.baidu.com上获取百度icon的url,怎么办呢?这就必须得对html文档元素进行操作了。对!Jsoup就是Java中用来操作html文档的一个第三方类库。

 

二、Why Jsoup?用过jQuery的都知道,jQuery语法简单、功能超强,让人用了就爱不释手,而Jsoup的用法就是类jQuery的用法,所以会jQuery的话,稍微看一下文档就会使用Jsoup了。

 

三、具体应用


    上面罗里吧嗦的简单介绍了一下jsoup,现在简单介绍一下在生产环境中的具体应用。小弟最近遇到的一个需求是通过iTunesId到苹果的网站上去获取这个应用的Icon和描述图片。具体的思路是:
1、从Jsoup的官方网站下载jsoup的jar,最新的版本为1.6.3,并且这个包是无依赖包,不需要和其他的jar包配合使用
2、加载文档
3、查找icon元素并获取其地址
4、获取描述图片的元素并取得地址
业务并不复杂,贴一下代码,注释还算清楚。
package com.akwolf.quartz;    import java.io.ByteArrayOutputStream;  import java.io.IOException;  import java.util.ArrayList;  import java.util.List;    import org.apache.log4j.Logger;  import org.jdom.CDATA;  import org.jdom.output.Format;  import org.jdom.output.XMLOutputter;  import org.jsoup.Jsoup;  import org.jsoup.nodes.Document;  import org.jsoup.nodes.Element;  import org.jsoup.select.Elements;  import org.quartz.Job;  import org.quartz.JobExecutionContext;  import org.quartz.JobExecutionException;    public class AppleJob implements Job {     private static Logger logger = Logger.getLogger(AppleJob.class);     public static void main(String[] args) {    AppleJob aj = new AppleJob();    try {     String itunesId = "444934666";     try {      Document doc = aj.getHtmlDoc(itunesId);      logger.debug("itunesId : " + itunesId + " , doc : ");      // 如果不是一个有效的链接      if (doc == null || !aj.isCorrectItunesId(doc)) {       throw new IOException();      }      String params = aj.getProParameter(aj.getIconUrl(doc),        aj.getDescImgUrl(doc));      logger.debug("params : " + params);       } catch (IOException e) {      logger.warn("Can't connect to : " + itunesId);     }    } catch (Exception e) {     e.printStackTrace();     logger.warn("Has some Exception!!!");    }   }     @Override   public void execute(JobExecutionContext jec) throws JobExecutionException {    try {     String itunesId = "444934666";     try {      Document doc = this.getHtmlDoc(itunesId);      logger.debug("itunesId : " + itunesId + " , doc : ");      // 如果不是一个有效的链接      if (doc == null || !isCorrectItunesId(doc)) {       throw new IOException();      }      String params = this.getProParameter(this.getIconUrl(doc),        this.getDescImgUrl(doc));      logger.debug("params : " + params);       } catch (IOException e) {      logger.warn("Can't connect to : " + itunesId);     }    } catch (Exception e) {     e.printStackTrace();     logger.warn("Has some Exception!!!");     
JobExecutionException jee = new JobExecutionException(e);     jee.refireImmediately();    }   }     /**    *     * 判断是否是一个有效的连接    *     * @param doc    * @return    */   private boolean isCorrectItunesId(Document doc) {    Elements footer = doc.select("p.footer");    if (footer == null || footer.size() == 0) {     return true;    }    return (!"http://www.apple.com.cn/itunes/download/".equals(footer      .first().select("a").first().absUrl("href")));   }     /**    * @param iconUrl    * @param descImgUrl    * @return    * @throws IOException    *     *     *             拼装出存储过程的参数    */   private String getProParameter(String iconUrl, List<String> descImgUrl)     throws IOException {    StringBuffer buffer = new StringBuffer();    org.jdom.Element root = new org.jdom.Element("params");    org.jdom.Document doc = new org.jdom.Document(root);    org.jdom.Element icon = new org.jdom.Element("IconUrl");    org.jdom.Element desc = new org.jdom.Element("DescUrl");    ByteArrayOutputStream out = new ByteArrayOutputStream();    for (int i = 0; i < descImgUrl.size(); i++) {     if (i == descImgUrl.size() - 1) {      buffer.append(descImgUrl.get(i));     } else {      buffer.append(descImgUrl.get(i)).append(",");     }    }    icon.addContent(new CDATA(iconUrl));    desc.addContent(new CDATA(buffer.toString()));    root.addContent(icon).addContent(desc);      Format format = Format.getRawFormat();    format.setOmitDeclaration(true);    XMLOutputter writer = new XMLOutputter(format);    writer.output(doc, out);    return out.toString();   }     /**    * @param itunesId    * @return    * @throws IOException    *     *             取得html文档    */   private Document getHtmlDoc(String itunesId) throws IOException {    Document doc = Jsoup.connect(      "http://itunes.apple.com/cn/app/id" + itunesId).get();    return doc;   }     /**    * @param doc    * @return    *     *         解析出app的icon图片    */   private String getIconUrl(Document doc) {    Element img = 
doc.select("#left-stack").first().select("img").first();    String src = img.attr("abs:src");    return src;   }     /**    * @param doc    * @return    *     *         解析出描述图片的url    */   private List<String> getDescImgUrl(Document doc) {    List<String> list = new ArrayList<String>();    Elements wrapper = doc.select("div.center-stack").first()      .select("img.portrait");    for (Element element : wrapper) {     list.add(element.attr("abs:src"));    }    return list;   }    }
结果
26 [main] DEBUG com.akwolf.quartz.AppleJob (2012-09-23 10:50:12,136) - params : <params><IconUrl><![CDATA[http://a3.mzstatic.com/us/r1000/104/Purple/v4/bb/71/3a/bb713a76-5492-9456-5114-63c19582b3c3/mza_297600500225037689.175x175-75.jpg]]></IconUrl><DescUrl><![CDATA[http://a5.mzstatic.com/us/r1000/080/Purple/v4/05/d6/d9/05d6d9d0-686f-63ad-7713-34b5b5aaea7f/mza_2616996279428462004.320x480-75.jpg,http://a2.mzstatic.com/us/r1000/067/Purple/v4/4d/9e/97/4d9e97ea-8367-764d-fceb-e607f55bf847/mza_7946357603060756898.320x480-75.jpg,http://a2.mzstatic.com/us/r1000/071/Purple/v4/21/7a/45/217a4551-d1b1-27e5-3a15-dec47a136069/mza_6366124751766881957.320x480-75.jpg,http://a5.mzstatic.com/us/r1000/065/Purple/v4/7b/5b/ad/7b5badf9-73ba-dd72-4dc9-95a671aa1306/mza_7359786108776115754.320x480-75.jpg,http://a4.mzstatic.com/us/r1000/112/Purple/v4/b7/19/25/b71925c1-a2a7-c69d-fd9c-d3368d0fb1b5/mza_8201700535726140806.320x480-75.jpg]]></DescUrl></params>
来自:http://blog.csdn.net/akwolf/article/details/8009226