简单的 Java 爬虫抓取网页实现代码

nbnb的头像 nbnb 11 2015-01-04 21:20 2

 基本信息

× 1   

浏览数: 9722

分享时间: 4 年前

19
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Queue;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * A minimal, single-threaded, breadth-first web crawler restricted to one host.
 *
 * <p>Starting from a seed URL, {@link #traverse(String)} downloads each page with
 * jsoup, collects the absolute targets of its {@code a[href]} elements, and
 * enqueues every target that looks like a URL, belongs to {@link #HOST}, and has
 * not been seen before. Every URL taken from the queue is appended to the visited
 * list, whether or not the download succeeded.
 *
 * <p>Usage:
 * <pre>
 * Robot robot = new Robot();
 * robot.traverse("http://debugs.tk");
 * for (String s : robot.getUrlList()) {
 *     System.out.println(s);
 * }
 * </pre>
 *
 * <p>Instances hold plain mutable collections and perform no synchronization.
 */
public class Robot {

    /** Host the crawl is restricted to; sub-domains of it are accepted as well. */
    public final static String HOST = "debugs.tk";

    /**
     * Whole-string pattern for {@code scheme://rest}: one or more ASCII letters,
     * then {@code ://}, then any run of non-whitespace characters.
     * Compiled once because it is evaluated for every link on every page.
     */
    private static final Pattern URL_PATTERN = Pattern.compile("^[a-zA-Z]+://[^\\s]*");

    /** Timeout, in milliseconds, passed to jsoup for each page download. */
    private static final int TIMEOUT_MILLIS = 5000;

    /** URLs already taken from the queue, in the order they were processed. */
    private List<String> urlList;

    /** URLs discovered but not yet processed (FIFO). */
    private Queue<String> urlQueue;

    /** Creates a crawler with an empty visited list and an empty pending queue. */
    public Robot() {
        super();
        // Assign the fields directly: calling the overridable setters from a
        // constructor would run subclass code on a half-initialized object.
        this.urlList = new LinkedList<String>();
        this.urlQueue = new LinkedList<String>();
    }

    /**
     * Returns the live list of processed URLs (not a copy).
     *
     * @return the visited-URL list, in processing order
     */
    public List<String> getUrlList() {
        return urlList;
    }

    /**
     * Replaces the visited-URL list.
     *
     * @param urlList the list that subsequent crawling appends to
     */
    public void setUrlList(List<String> urlList) {
        this.urlList = urlList;
    }

    /**
     * Returns the live queue of pending URLs (not a copy).
     *
     * @return the pending-URL queue
     */
    public Queue<String> getUrlQueue() {
        return urlQueue;
    }

    /**
     * Replaces the pending-URL queue.
     *
     * @param urlQueue the queue that subsequent crawling reads from and adds to
     */
    public void setUrlQueue(Queue<String> urlQueue) {
        this.urlQueue = urlQueue;
    }

    /**
     * Tells whether the text has the shape {@code scheme://non-whitespace}.
     *
     * @param url candidate text; may be {@code null}
     * @return {@code true} if the whole text matches {@link #URL_PATTERN},
     *         {@code false} otherwise or when {@code url} is {@code null}
     */
    private boolean isURL(String url) {
        return url != null && URL_PATTERN.matcher(url).matches();
    }

    /**
     * Tells whether the URL's host component is {@link #HOST} or one of its
     * sub-domains (compared case-insensitively).
     *
     * <p>The host is extracted by parsing the URL rather than by searching the
     * whole string, so a foreign URL that merely mentions {@code HOST} in its
     * path, query, or user-info part is rejected.
     *
     * @param url absolute URL to test; may be {@code null}
     * @return {@code true} if the parsed host equals {@code HOST} or ends with
     *         {@code "." + HOST}; {@code false} for {@code null}, for text that
     *         {@link URL} cannot parse (for example text without a scheme), and
     *         for any other host
     */
    public static boolean isHost(String url) {
        if (url == null) {
            return false;
        }
        final String host;
        try {
            host = new URL(url).getHost();
        } catch (MalformedURLException e) {
            // Unparseable text cannot be crawled, so it is not "on host".
            return false;
        }
        if (host.equalsIgnoreCase(HOST)) {
            return true;
        }
        // Accept "something.HOST" but not "somethingHOST".
        String suffix = "." + HOST;
        int offset = host.length() - suffix.length();
        return offset > 0 && host.regionMatches(true, offset, suffix, 0, suffix.length());
    }

    /**
     * Crawls breadth-first from {@code seed}, staying on {@link #HOST}.
     *
     * <p>The seed is appended to the pending queue unconditionally. Then, until
     * the queue is empty, the head URL is removed, appended to the visited list,
     * and downloaded exactly once; each qualifying link found on the page is
     * enqueued the first time it is seen. URLs already present in the visited
     * list or the pending queue when this method starts count as seen.
     *
     * <p>A page that fails to download is reported on {@code System.err} and
     * skipped; it still appears in the visited list.
     *
     * @param seed absolute URL to start from
     * @throws IllegalArgumentException if {@code seed} is {@code null}
     */
    public void traverse(String seed) {
        if (seed == null) {
            throw new IllegalArgumentException("seed must not be null");
        }
        Queue<String> pending = getUrlQueue();
        List<String> visited = getUrlList();

        // Hash-based membership check instead of scanning both linked lists
        // for every link on every page.
        Set<String> seen = new HashSet<String>(visited);
        seen.addAll(pending);
        seen.add(seed);
        pending.add(seed);

        while (!pending.isEmpty()) {
            // Dequeue first, then fetch: every URL is downloaded exactly once,
            // including the seed and the last URL left in the queue.
            String current = pending.poll();
            visited.add(current);

            Document page = fetch(current);
            if (page == null) {
                continue;
            }
            Elements anchors = page.select("a[href]");
            for (Element anchor : anchors) {
                // "abs:href" asks jsoup for the link resolved against the page URL.
                String link = anchor.attr("abs:href");
                if (isURL(link) && isHost(link) && seen.add(link)) {
                    pending.add(link);
                }
            }
        }
    }

    /**
     * Downloads and parses one page.
     *
     * @param url absolute URL to download
     * @return the parsed document, or {@code null} if the download failed with
     *         an {@link IOException}
     */
    private Document fetch(String url) {
        try {
            return Jsoup.connect(url).timeout(TIMEOUT_MILLIS).get();
        } catch (IOException e) {
            // Best-effort crawl: report the failure and let the caller move on.
            System.err.println("Failed to fetch " + url + ": " + e);
            return null;
        }
    }

}


12 [下一页]

  • ossaa的头像 ossaa 2018-12-14 18:38 代码数:0

    It is imperative that we read the blog post very carefully. I have already done so and find that this post is really amazing.

    nikasticshop.com

  • xu144227的头像 xu144227 2018-12-17 20:12 代码数:0

    怎么用

     

  • ossaa的头像 ossaa 2018-12-18 21:25 代码数:0

    Thank you so much for sharing this great blog. Very inspiring and helpful too. Hope you continue to share more of your ideas. I will definitely love to read. Krypto Index CIF200

  • ossaa的头像 ossaa 2018-12-25 21:24 代码数:0

    Nice blog and absolutely outstanding. You can do something much better but i still say this perfect.Keep trying for the best.

    private investigator

  • ossaa的头像 ossaa 2018-12-30 03:37 代码数:0

    If you set out to make me think today; mission accomplished!  I really like your writing style and how you express your ideas.  Thank you.

    tra giam can

  • 547240561的头像 547240561 2016-06-05 17:54 代码数:0

  • ossaa的头像 ossaa 2019-01-08 03:30 代码数:0

    I am happy to find this post Very useful for me, as it contains lot of information. I Always prefer to read The Quality and glad I found this thing in you post. Thanks

    Royal Home Painters Toronto

  • yamuna的头像 yamuna 2019-01-08 15:29 代码数:0

    This is a wonderful article, Given so much info in it, These type of articles keeps the users interest in the website, and keep on sharing more ... good luck

    <a href="https://games.lol/arcade/">arcade</a>

  • jhoneila的头像 jhoneila 2019-01-10 20:38 代码数:0

    Thank you for such a well written article.  It’s full of insightful information and entertaining descriptions.  Your point of view is the best among many. Wordzilla

  • ossaa的头像 ossaa 2019-01-10 21:19 代码数:0

    I can set up my new idea from this post. It gives in depth information. Thanks for this valuable information for all,..

    vipbox

您的评论: