Scraping Website Data with Java

Suppose you want to find out how many Java openings are listed on the 51job site. First you need to work out how the 51job search feature works. By inspecting the page source we can establish the following:

1. The URL the search page submits to is http://search.51job.com/jobsearch/search_result.php
2. The request method is POST.
3. The returned page is encoded as GBK.
4. The number we want appears in the returned HTML in a fragment such as "1-30 / 14794", so we can use the pattern ".+1-\d+ / (\d+).+"; the first capture group is the figure we are after. For details on patterns in Java, see the Pattern class in the Java documentation.
5. As a POST request, the page sends the following data to the server (this is easy to capture with a JS framework such as prototype; see my other posts):

lang=c&stype=1&postchannel=0000&fromType=1&line=&keywordtype=2&keyword=java&btnJobarea=%E9%80%89%E6%8B%A9%E5%9C%B0%E5%8C%BA&jobarea=0000&image=&btnFuntype=%E9%80%89%E6%8B%A9%2F%E4%BF%AE%E6%94%B9&funtype=0000&btnIndustrytype=%E9%80%89%E6%8B%A9%2F%E4%BF%AE%E6%94%B9&industrytype=00

We do not care which of the fields in item 5 the server actually needs; we simply send all of them. With this preparation done, we can send the request from Java and extract the final figure; a usage sketch that ties everything together follows the fetch code.

We define a Resource class that wraps everything related to the request. Resource has the following fields:

/**
 * Target address of the resource, without the query string.
 */
private String target;
/**
 * Query string for a GET request, or the request body for a POST request.
 */
private String queryData = "";
/**
 * Request method, GET / POST.
 */
private String method = "GET";
/**
 * Character encoding of the returned data.
 */
private String charset = "GBK";
/**
 * Pattern used to extract data; the pattern's groups become the returned list.
 */
private String pattern;

The code that fetches and extracts the content:

// Assume the res object below wraps all the request information.
// res.getTarget() returns the target address; for a GET request it already includes the query string.
URL url = new URL(res.getTarget());
HttpURLConnection con = (HttpURLConnection) url.openConnection(); // open a connection to the target
con.setRequestMethod(res.getMethod());                            // set the request method
// set the HTTP request headers
con.setRequestProperty("accept", "*/*");
con.setRequestProperty("connection", "Keep-Alive");
con.setRequestProperty("user-agent",
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1)");
con.setDoInput(true);
if (res.getMethod().equals("POST")) {
    // for a POST request, send the request body
    con.setDoOutput(true);
    con.getOutputStream().write(res.getQueryData().getBytes());
    con.getOutputStream().flush();
}
// Read the response line by line with a BufferedReader; adapt this if you need the whole page instead.
BufferedReader br = new BufferedReader(new InputStreamReader(
        con.getInputStream(), res.getCharset()));
Pattern pattern = Pattern.compile(res.getPattern());
String s = null;
while ((s = br.readLine()) != null) {
    System.out.println(s);
    Matcher m = pattern.matcher(s);
    // check whether the current line matches the expected pattern
    if (!m.matches()) {
        continue;
    }
    int size = m.groupCount();
    List result = new ArrayList(size);
    for (int i = 0; i < size; i++) {
        // if there are several groups, collect them all and return them as a list
        result.add(m.group(i + 1));
    }
    return result;
}
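The listing above is a fragment rather than a complete method. As a rough sketch of how it might be wired up for the 51job query — the JobCountDemo class, the Resource setters and the fetch(Resource) wrapper are assumptions of mine, not part of the original:

import java.util.List;

public class JobCountDemo {
    public static void main(String[] args) throws Exception {
        // Hypothetical wiring: Resource carries the request data, fetch(...) is the
        // extraction code shown above wrapped into a method.
        Resource res = new Resource();
        res.setTarget("http://search.51job.com/jobsearch/search_result.php");
        res.setMethod("POST");
        res.setCharset("GBK");
        // the form data from item 5; shortened here to the essential fields for readability
        res.setQueryData("lang=c&stype=1&postchannel=0000&keywordtype=2&keyword=java");
        res.setPattern(".+1-\\d+ / (\\d+).+");

        List result = fetch(res);   // returns the capture groups of the first matching line
        if (result != null && !result.isEmpty()) {
            System.out.println("Java openings on 51job: " + result.get(0));
        }
    }

    // placeholder so the sketch stands on its own; the real body is the listing above
    static List fetch(Resource res) throws Exception { return null; }
}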
Fetching a remote page and parsing the XML it returns

XmlTransfer.java handles the connection to the remote server:

package untitled1;

import java.net.URL;
import java.net.URLConnection;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.ProtocolException;
import java.io.IOException;
import java.io.InputStream;
import java.io.PrintWriter;

public class XmlTransfer {

    private String urlAddr;
    private String xmlStr;
    HttpURLConnection urlCon = null;

    public XmlTransfer(String _urlAddr, String _xmlStr) {
        this.urlAddr = _urlAddr;
        this.xmlStr = _xmlStr;
    }

    public InputStream get() throws Exception {
        if (urlCon == null) { urlCon = getUrlConnection(); }
        if (urlCon == null) { throw new Exception("Failed to connect"); }
        PrintWriter out = new PrintWriter(urlCon.getOutputStream());
        out.print(xmlStr);
        out.flush();
        out.close();
        // do not disconnect here: the caller still has to read the stream we return
        InputStream fin1 = urlCon.getInputStream();
        return fin1;
    }

    private HttpURLConnection getUrlConnection() {
        try {
            URL url = new URL(urlAddr);
            URLConnection conn = url.openConnection();
            urlCon = (HttpURLConnection) conn;
            urlCon.setRequestProperty("Content-type", "text/html;charset=gb2312");
            urlCon.setDoOutput(true);
            urlCon.setRequestMethod("GET");
            urlCon.setUseCaches(false);
        } catch (MalformedURLException mex) {
            mex.printStackTrace();
        } catch (ProtocolException pex) {
            pex.printStackTrace();
        } catch (IOException iex) {
            iex.printStackTrace();
        }
        return urlCon;
    }

    public static String getHttp(String strURL) {
        XmlTransfer xt = new XmlTransfer(strURL, "");
        StringBuffer sb = new StringBuffer();
        try {
            InputStream is = xt.get();
            byte[] b = new byte[1024];
            int iCount = 0;
            while ((iCount = is.read(b)) > 0) {
                sb.append(new String(b, 0, iCount));
            }
        } catch (Exception e) {
            sb.append("An error occurs in XmlTransfer.getHttp\n");
            sb.append(e.getMessage());
        }
        return sb.toString();
    }

    public static void main(String[] args) throws Exception {
        System.out.println(
            XmlTransfer.getHttp("http://215.117.110.81/yyoa/oainfo.jsp?comm=person"));
        // http://192.168.0.110/testProvince.html
    }
}
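Note that getHttp decodes each 1024-byte chunk with the platform default charset, so a multi-byte character that straddles a chunk boundary gets corrupted. A charset-safe variant that could be added to XmlTransfer (a sketch of mine, not part of the original class; it assumes the gb2312 encoding the class already announces in its Content-type header, and BufferedReader / InputStreamReader would need to be imported at the top of the file):

public static String getHttpSafe(String strURL) {
    StringBuilder sb = new StringBuilder();
    try {
        InputStream is = new XmlTransfer(strURL, "").get();
        // the Reader does the decoding, so multi-byte characters can never be split
        // across buffer boundaries the way they can with new String(b, 0, iCount)
        BufferedReader reader = new BufferedReader(
                new InputStreamReader(is, "gb2312"));   // assumed encoding
        char[] buf = new char[1024];
        int n;
        while ((n = reader.read(buf)) > 0) {
            sb.append(buf, 0, n);
        }
        reader.close();
    } catch (Exception e) {
        sb.append("An error occurs in getHttpSafe\n").append(e.getMessage());
    }
    return sb.toString();
}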

UsrDataSync.java drives the fetch:

package untitled1;

import java.util.Calendar;
import java.util.TimerTask;
import javax.servlet.ServletContext;
import java.io.File;
import usersync.ParseXML;   // ParseXML is declared in the usersync package in the listing below

/**
 * <p>Title: </p>
 * <p>Description: </p>
 * <p>Copyright: Copyright (c) 2006</p>
 * <p>Company: </p>
 * @author not attributable
 * @version 1.0
 */
public class UsrDataSync {

    public UsrDataSync() {
    }

    public static boolean doSync() {
        String strXml;
        ParseXML px = new ParseXML();
        strXml = XmlTransfer.getHttp("http://215.117.110.81/yyoa/oainfo.jsp?comm=person");
        strXml = strXml.replaceAll("\r\n", "");
        px.doParse(strXml);
        return false;
    }

    public static void main(String[] args) throws Exception {
        UsrDataSync.doSync();
    }
}
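UsrDataSync imports Calendar, TimerTask, ServletContext and File without using them, which hints that the sync was meant to run as a scheduled job inside a web application. A minimal sketch of driving doSync() from a java.util.Timer — the SyncScheduler class and the one-hour interval are my assumptions, not part of the original:

import java.util.Timer;
import java.util.TimerTask;

// Hypothetical scheduler: runs doSync() once an hour.
// Assumes it sits in the same package as UsrDataSync.
public class SyncScheduler {
    public static void main(String[] args) {
        Timer timer = new Timer("user-sync");   // non-daemon, so it keeps the JVM alive
        timer.scheduleAtFixedRate(new TimerTask() {
            @Override
            public void run() {
                UsrDataSync.doSync();   // fetch the page and push it through ParseXML
            }
        }, 0L, 60L * 60L * 1000L);              // start now, repeat every hour (assumed interval)
    }
}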

ParseXML.java parses the XML (regular expressions included):

//import java.awt.*;
//import javax.servlet.*;
//import javax.servlet.http.*;
//import javax.servlet.jsp.*;
//import org.apache.jasper.runtime.*;
package usersync;

import java.io.*;
import javax.xml.parsers.*;
import org.w3c.dom.*;
import java.sql.*;

/**
 * <p>Title: </p>
 * <p>Description: </p>
 * <p>Copyright: Copyright (c) 2006</p>
 * <p>Company: </p>
 * @author not attributable
 * @version 1.0
 */
public class ParseXML {

    // StringBuffer os = new StringBuffer();
    Document doc = null;
    public Connection con = null;
    public Connection con_history = null;

    public String doParse(String str) {
        try {
            DocumentBuilder builder =
                    DocumentBuilderFactory.newInstance().newDocumentBuilder();
            // InputStream is = xt.get();
            doc = builder.parse(new ByteArrayInputStream(str.getBytes()));
            NodeList nl = doc.getElementsByTagName("person");
            int i = 0;
            int len = nl.getLength();
            Element tempElement = null;
            while (i < len) {
                tempElement = (Element) nl.item(i);
                // The original listing is garbled at this point: duty, department, station,
                // state and description are read from the corresponding child elements of
                // <person> and still carry their markup, which the replaceAll calls below
                // strip off. The lost assignments are stubbed with empty strings here.
                String duty = "", department = "", station = "", state = "", description = "";
                duty = duty.replaceAll(".duty\\sid..\\d..", "");
                duty = duty.replaceAll("</duty>", "");
                department = department.replaceAll(".department\\sid..\\d..", "");
                department = department.replaceAll("</department>", "");
                station = station.replaceAll(".station\\sid..\\d..", "");
                station = station.replaceAll("</station>", "");
                state = state.replaceAll(".state\\sid..\\d..", "");
                state = state.replaceAll("</state>", "");
                description = description.replaceAll(".description\\sid..\\d..", "");
                description = description.replaceAll("</description>", "");
                syncUser(Integer.parseInt(tempElement.getAttribute("id")),
                        tempElement.getAttribute("name"),
                        tempElement.getAttribute("logname"),
                        duty, department, station, state, description);
                // syncUser(Integer.parseInt(tempElement.getAttribute("id")),
                //         tempElement.getAttribute("name"),
                //         tempElement.getAttribute("logname"),
                //         nl.item(i).getChildNodes().item(1).getTextContent(),
                //         nl.item(i).getChildNodes().item(3).getTextContent(),
                //         nl.item(i).getChildNodes().item(5).getTextContent(),
                //         nl.item(i).getChildNodes().item(7).getTextContent(),
                //         nl.item(i).getChildNodes().item(9).getTextContent());
                i++;
            }
            System.out.println(doc.toString());
            // System.out.println(os.toString());
            // System.out.println("==============" + System.currentTimeMillis() + "==============");
        } catch (Exception e) {
            System.out.println(e.getMessage());
        }
        return null;
    }
    private boolean syncUser(int uid, String usrname, String logname, String duty,
                             String department, String station, String state,
                             String description) {
        int pos = 0;    // 0: new, 1: running, 2: history
        try {
            Class.forName("com.mysql.jdbc.Driver");
            con = DriverManager.getConnection("jdbc:mysql://localhost:3306/blog", "root", "root");
            con_history = DriverManager.getConnection("jdbc:mysql://localhost:3306/blog_history", "root", "root");
            Statement st = con.createStatement();
            Statement stt = con_history.createStatement();
            // new (running) table
            ResultSet rs = st.executeQuery("select * from blogusers where id=" + Integer.toString(uid));
            // old (history) table
            ResultSet rss = stt.executeQuery("select * from blogusers where id=" + uid);
            if (rs.next()) {
                String dp = department.substring(0, 2);
                if (dp.equals("中央")) {
                    st.executeUpdate("update blogusers set id=" + uid + ",TRUENAME='" + usrname
                            + "',DUTYNAME='" + duty + "'," + "FLAG=" + 1
                            + ",DEPMENT='" + department + "' where id=" + uid);
                } else {
                    st.executeUpdate("update blogusers set id=" + uid + ",TRUENAME='" + usrname
                            + "',DUTYNAME='" + duty + "'," + "FLAG=" + 2
                            + ",DEPMENT='" + department + "' where id=" + uid);
                }
                // record lives in the running database
                pos = 1;
            } else if (rss.next()) {
                st.executeUpdate("update blogusers set id=" + uid + ",TRUENAME='" + usrname
                        + "',DUTYNAME='" + duty + "'," + "FLAG=" + 2
                        + ",DEPMENT='" + department + "' where id=" + uid);
                // record lives in the history database
                pos = 2;
            }
            if (state.equals("在职")) {   // "在职" = still employed
                switch (pos) {
                    // new record
                    case 0:
                        // insert into the running database
                        st.executeUpdate("insert into blogusers(id,TRUENAME,DUTYNAME,FLAG,DEPMENT)"
                                + " values(" + uid + ",'" + usrname + "','" + duty + "',"
                                + (department.startsWith("中央") ? 1 : 2) + ",'" + department + "')");
                        break;
                    // record currently in the history database
                    case 2:
                        // move from running to history
                        moveUser(con, con_history, uid, usrname, logname, duty, department,
                                station, state, description);
                        break;
                }
            } else {
                switch (pos) {
                    // new record
                    case 0:
                        // insert into the history database
                        stt.executeUpdate("insert into blogusers(id,TRUENAME,DUTYNAME,FLAG,DEPMENT)"
                                + " values(" + uid + ",'" + usrname + "','" + duty + "',"
                                + (department.startsWith("中央") ? 1 : 2) + ",'" + department + "')");
                        break;
                    // record currently in the running database
                    case 1:
                        // move from history to running
                        moveUser(con_history, con, uid, usrname, logname, duty, department,
                                station, state, description);
                        break;
                }
            }
            // rs.close();
            // st.close();
            // con.close();
        } catch (Exception e) {
            e.printStackTrace();
        }
        return false;
    }

    private void moveUser(Connection src, Connection dest, int uid, String usrname,
                          String logname, String duty, String department, String station,
                          String state, String description) throws SQLException {
        Statement st1 = src.createStatement();
        Statement st2 = dest.createStatement();
        // query the source (running) database
        ResultSet rs1 = st1.executeQuery("select * from blogusers where id=" + uid);
        String s1 = "";
        String s2 = "";
        String s3 = "";
        // String s4 = "";
        // String s5 = "";
        String s6 = "";
        // String s7 = "";
        String s8 = "";
        if (rs1.next()) {
            s1 = rs1.getString(1);
            s2 = rs1.getString(2);
            s3 = rs1.getString(3);
            // s4 = rs1.getString(4);
            // s5 = rs1.getString(5);
            s6 = rs1.getString(6);
            // s7 = rs1.getString(7);
            s8 = rs1.getString(8);
        }
        // insert into the destination (history) database
        st2.executeUpdate("insert into blogusers(id,TRUENAME,DUTYNAME,FLAG,DEPMENT)"
                + " values(" + uid + ",'" + usrname + "','" + duty + "',"
                + (department.startsWith("中央") ? 1 : 2) + ",'" + department + "')");
        // delete the record from the source database
        st1.executeUpdate("delete from blogusers where id=" + uid);
    }
}
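syncUser builds its SQL by concatenating values parsed from the XML, which breaks as soon as a name or department contains a quote and leaves the code open to SQL injection. A parameterised variant of the update, as a sketch that could sit next to syncUser in ParseXML (java.sql.* is already imported there; the method name updateUser is mine, not from the original):

// Same table and columns as the update in syncUser, but with bound parameters.
static void updateUser(Connection con, int uid, String usrname, String duty,
                       String department) throws Exception {
    int flag = department.startsWith("中央") ? 1 : 2;
    String sql = "update blogusers set TRUENAME=?, DUTYNAME=?, FLAG=?, DEPMENT=? where id=?";
    PreparedStatement ps = con.prepareStatement(sql);
    try {
        ps.setString(1, usrname);
        ps.setString(2, duty);
        ps.setInt(3, flag);
        ps.setString(4, department);
        ps.setInt(5, uid);
        ps.executeUpdate();   // the driver escapes the values, so quotes in names are safe
    } finally {
        ps.close();
    }
}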
Scraping web page data

With nothing much to do at the moment I have been playing with this. I had often heard people talk about scraping website data and it sounded impressive, so I looked into it myself. I am hardly an expert yet; I wrote the code below and will keep improving it, hoping that one day it can crawl the major sites the way a search engine does. (The snippets in this section are C#.)

// Find links by scanning the HTML for href attributes
private string GetUrl(string strWebContent)
{
    //string strRef = @"(href|HREF|src|SRC|action|ACTION|Action)[ ]*=[ ]*[""'][^""'#>]+[""']";
    string strRef = @"(href|HREF)[ ]*=[ ]*[""'][^""'#>]+[""']";
    string strResult = "";
    MatchCollection matches = new Regex(strRef).Matches(strWebContent);
    for (int i = 0; i < matches.Count; i++)
    {
        strResult += matches[i].ToString().Replace("href=", "") + "\r\n";
    }
    return strResult;
}

// Extract the URL pattern (note: as written this returns the pattern itself, not a match)
private string GetUrl()
{
    string strRef = @"(href|HREF)[ ]*=[ ]*[""'][^""'#>]+[""']";
    Regex objRegExp = new Regex(strRef);
    return strRef;
}

/// <summary>
/// Strip HTML tags and decode the &lt; and &gt; entities
/// </summary>
/// <param name="strHtml">string to convert</param>
/// <returns>converted string</returns>
private string stripHtml(string strHtml)
{
    Regex objRegExp = new Regex("<(.|\n)+?>");
    string strOutput = objRegExp.Replace(strHtml, "");
    strOutput = strOutput.Replace("&lt;", "<");
    strOutput = strOutput.Replace("&gt;", ">");
    return strOutput;
}

// Get the page title
private string GetTitle(string strWebContent)
{
    Match TitleMatch = Regex.Match(strWebContent, "<title>([^<]*)</title>",
            RegexOptions.IgnoreCase | RegexOptions.Multiline);
    return TitleMatch.Groups[1].Value;
}

// Get the meta description
private string GetDescription(string strWebContent)
{
    // the original pattern was lost when this document was converted; it matched the
    // content attribute of the page's description <meta> tag, along these lines
    Match Desc = Regex.Match(strWebContent, "<meta name=\"DESCRIPTION\" content=\"([^<]*)\">",
            RegexOptions.IgnoreCase | RegexOptions.Multiline);
    return Desc.Groups[1].Value;
}

// Download the HTML source for a URL
private string GetWebContent(string Url)
{
    string strResult = "";
    try
    {
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(Url);  // build the request
        request.Timeout = 30000;                                          // connection timeout
        request.Headers.Set("Pragma", "no-cache");
        HttpWebResponse response = (HttpWebResponse)request.GetResponse();
        Stream streamReceive = response.GetResponseStream();
        Encoding encoding = Encoding.GetEncoding("GB2312");
        StreamReader streamReader = new StreamReader(streamReceive, encoding);
        strResult += streamReader.ReadToEnd();
    }
    catch { }
    return strResult;
}
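The link-extraction idea in GetUrl translates directly to Java, the language the rest of this article uses. A rough Java equivalent with java.util.regex — a sketch of mine, not taken from the original:

import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

// Java counterpart of the C# GetUrl method: collect href="..." / href='...' values from a page.
public class LinkExtractor {

    private static final Pattern HREF =
            Pattern.compile("href\\s*=\\s*[\"']([^\"'#>]+)[\"']", Pattern.CASE_INSENSITIVE);

    public static List<String> extractLinks(String html) {
        List<String> links = new ArrayList<String>();
        Matcher m = HREF.matcher(html);
        while (m.find()) {
            links.add(m.group(1));   // group 1 is the URL inside the quotes
        }
        return links;
    }
}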
Java basics: fetching page content with HttpClient

HTTP is the most important protocol on today's Internet, and a great deal of software and services depend on it. The java.net package offers basic HTTP support, but many advanced and more complex features are missing, which is a real pity. HttpClient, one of Apache's open-source projects, provides powerful client-side support for HTTP operations; at the time of writing the latest version was 3.0RC3. The following example briefly shows how HttpClient is used:

--------------------------------------------------------------------------------

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.util.*;
import org.apache.commons.httpclient.Header;
import org.apache.commons.httpclient.HostConfiguration;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpConnection;
import org.apache.commons.httpclient.MultiThreadedHttpConnectionManager;
import org.apache.commons.httpclient.NameValuePair;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.httpclient.methods.PostMethod;

/**
 * @author steven
 */
public class HttpClientExample {

    // the ConnectionManager and its parameters
    private static MultiThreadedHttpConnectionManager manager =
            new MultiThreadedHttpConnectionManager();
    private static int connectionTimeOut = 20000;
    private static int socketTimeOut = 10000;
    private static int maxConnectionPerHost = 5;
    private static int maxTotalConnections = 40;

    // flag marking whether initialisation has been done
    private static boolean initialed = false;

    // initialise the ConnectionManager
    public static void SetPara() {
        manager.getParams().setConnectionTimeout(connectionTimeOut);
        manager.getParams().setSoTimeout(socketTimeOut);
        manager.getParams().setDefaultMaxConnectionsPerHost(maxConnectionPerHost);
        manager.getParams().setMaxTotalConnections(maxTotalConnections);
        initialed = true;
    }

    // fetch page content with a GET request
    public static String getGetResponseWithHttpClient(String url, String encode) {
        HttpClient client = new HttpClient(manager);
        if (!initialed) {
            HttpClientExample.SetPara();
        }
        GetMethod get = new GetMethod(url);
        get.setFollowRedirects(true);
        String result = null;
        StringBuffer resultBuffer = new StringBuffer();
        try {
            client.executeMethod(get);
            // when the target page is unknown, getResponseBodyAsString() is not recommended
            // String strGetResponseBody = get.getResponseBodyAsString();
            BufferedReader in = new BufferedReader(new InputStreamReader(
                    get.getResponseBodyAsStream(), get.getResponseCharSet()));
            String inputLine = null;
            while ((inputLine = in.readLine()) != null) {
                resultBuffer.append(inputLine);
                resultBuffer.append("\n");
            }
            in.close();
            // iso-8859-1 is the default reading encode
            result = HttpClientExample.ConverterStringCode(resultBuffer.toString(),
                    get.getResponseCharSet(), encode);
        } catch (Exception e) {
            e.printStackTrace();
            result = "";
        } finally {
            get.releaseConnection();
        }
        return result;
    }

    // fetch page content with a POST request (no request body)
    public static String getPostResponseWithHttpClient(String url, String encode) {
        HttpClient client = new HttpClient(manager);
        if (!initialed) {
            HttpClientExample.SetPara();
        }
        PostMethod post = new PostMethod(url);
        post.setFollowRedirects(false);
        StringBuffer resultBuffer = new StringBuffer();
        String result = null;
        try {
            client.executeMethod(post);
            BufferedReader in = new BufferedReader(new InputStreamReader(
                    post.getResponseBodyAsStream(), post.getResponseCharSet()));
            String inputLine = null;
            while ((inputLine = in.readLine()) != null) {
                resultBuffer.append(inputLine);
                resultBuffer.append("\n");
            }
            in.close();
            // iso-8859-1 is the default reading encode
            result = HttpClientExample.ConverterStringCode(resultBuffer.toString(),
                    post.getResponseCharSet(), encode);
        } catch (Exception e) {
            e.printStackTrace();
            result = "";
        } finally {
            post.releaseConnection();
        }
        return result;
    }

    // fetch page content with a POST request carrying form parameters
    public static String getPostResponseWithHttpClient(String url, String encode,
            NameValuePair[] nameValuePair) {
        HttpClient client = new HttpClient(manager);
        if (!initialed) {
            HttpClientExample.SetPara();
        }
        PostMethod post = new PostMethod(url);
        post.setRequestBody(nameValuePair);
        post.setFollowRedirects(false);
        String result = null;
        StringBuffer resultBuffer = new StringBuffer();
        try {
            client.executeMethod(post);
            BufferedReader in = new BufferedReader(new InputStreamReader(
                    post.getResponseBodyAsStream(), post.getResponseCharSet()));
            String inputLine = null;
            while ((inputLine = in.readLine()) != null) {
                resultBuffer.append(inputLine);
                resultBuffer.append("\n");
            }
            in.close();
            // iso-8859-1 is the default reading encode
            result = HttpClientExample.ConverterStringCode(resultBuffer.toString(),
                    post.getResponseCharSet(), encode);
        } catch (Exception e) {
            e.printStackTrace();
            result = "";
        } finally {
            post.releaseConnection();
        }
        return result;
    }

    // re-decode a string from the encoding it was read with into the requested encoding
    private static String ConverterStringCode(String source, String srcEncode, String destEncode) {
        if (source != null) {
            try {
                return new String(source.getBytes(srcEncode), destEncode);
            } catch (UnsupportedEncodingException e) {
                e.printStackTrace();
                return "";
            }
        } else {
            return "";
        }
    }
}

--------------------------------------------------------------------------------

After that, the target page can be fetched with:

String source = HttpClientExample.getGetResponseWithHttpClient("http://www.sina.com.cn", "GBK");

Note that by default the User-Agent header of HttpClient's requests is "Jakarta Commons-HttpClient 3.0RC1". If you need to change it (for example to Mozilla/4.0), run the following statement before making any calls:

System.getProperties().setProperty("httpclient.useragent", "Mozilla/4.0");

Dealing with garbled characters when fetching pages in Java

String htmlContent = "";
java.io.InputStream inputStream;
java.net.URL url = new java.net.URL("http://www.csdn.net");
java.net.HttpURLConnection connection = (java.net.HttpURLConnection) url.openConnection();
connection.connect();
inputStream = connection.getInputStream();
byte bytes[] = new byte[1024 * 100];
int index = 0;
int count = inputStream.read(bytes, index, bytes.length - index);
while (count != -1 && index < bytes.length) {
    index += count;
    count = inputStream.read(bytes, index, bytes.length - index);
}
System.out.println(index);
// decode the buffer in one go with the page's encoding instead of chunk by chunk
htmlContent = new String(bytes, 0, index, "gb2312");
// System.out.println(htmlContent);
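Hard-coding gb2312 only works for pages that really are gb2312. As a rough improvement (a sketch of mine, not from the original), the charset can be taken from the Content-Type response header, with gb2312 kept as the fallback:

import java.io.ByteArrayOutputStream;
import java.io.InputStream;
import java.net.HttpURLConnection;
import java.net.URL;

public class PageFetcher {

    // Decode with the charset announced in the Content-Type header,
    // e.g. "text/html; charset=GBK", falling back to gb2312 as above.
    public static String fetchPage(String address) throws Exception {
        HttpURLConnection connection = (HttpURLConnection) new URL(address).openConnection();
        connection.connect();

        String charset = "gb2312";   // fallback, matching the snippet above
        String contentType = connection.getContentType();
        if (contentType != null) {
            int idx = contentType.toLowerCase().indexOf("charset=");
            if (idx >= 0) {
                charset = contentType.substring(idx + "charset=".length()).trim();
            }
        }

        // buffer the whole body, then decode it once with the detected charset
        InputStream in = connection.getInputStream();
        ByteArrayOutputStream buffer = new ByteArrayOutputStream();
        byte[] chunk = new byte[4096];
        int n;
        while ((n = in.read(chunk)) != -1) {
            buffer.write(chunk, 0, n);
        }
        in.close();
        return new String(buffer.toByteArray(), charset);
    }
}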
