使用Java代码找出网站中的死链出现的页面

需求:发现网站中存在死链,需要找出这些死链出现在哪些页面
所需jar包:org.jsoup:jsoup:1.13.1

package com.company;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.IOException;
import java.net.URI;
import java.util.ArrayDeque;
import java.util.ArrayList;
import java.util.Deque;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

/**
 * Crawls a website and prints every page whose markup references one of the
 * configured dead-link keywords.
 *
 * <p>The crawl starts at {@code baseUrl + "/"}, follows only links that stay
 * under {@code baseUrl}, and fetches each distinct URL at most once. Pages
 * that cannot be loaded are reported on {@code System.err} instead of being
 * silently ignored.
 */
public class Main {
    // Keywords that identify dead links
    private static final String[] errUrls = new String[]{
            "/member", "feedback.php"
    };
    private static final String baseUrl = "http://www.xxx.com";
    // URLs already seen; initialised eagerly so check() also works when it is
    // called without going through main().
    private static final Set<String> checked = new HashSet<>();

    public static void main(String[] args) {
        check(baseUrl + "/");
    }

    /**
     * Crawls {@code url} and every not-yet-visited page under {@code baseUrl}
     * that is reachable from it, printing one line per (page, keyword) hit.
     *
     * <p>The traversal uses an explicit work stack rather than recursion, so
     * the depth of the link graph cannot overflow the call stack.
     *
     * @param url absolute URL of the page to start from; {@code null} or
     *            empty is ignored
     */
    public static void check(String url) {
        if (url == null || url.isEmpty()) {
            return;
        }
        // Mark the start page as visited so a link back to it is not fetched again.
        checked.add(url);
        Deque<String> pending = new ArrayDeque<>();
        pending.push(url);

        while (!pending.isEmpty()) {
            String page = pending.pop();
            try {
                for (String next : scanPage(page)) {
                    pending.push(next);
                }
            } catch (IOException | IllegalArgumentException e) {
                // A page that fails to load is worth knowing about when hunting
                // dead links. IllegalArgumentException covers URLs the HTTP
                // client rejects as malformed (assumed from jsoup's behavior).
                System.err.println("failed to load: " + page + "  ||  " + e);
            }
        }
    }

    /**
     * Fetches one page, reports the dead-link keywords it contains, and
     * returns the same-site links on it that have not been seen before.
     * Every resolvable link found is recorded in {@code checked}.
     *
     * @param page absolute URL of the page to fetch
     * @return newly discovered URLs under {@code baseUrl}; never {@code null}
     * @throws IOException if the page cannot be fetched
     */
    private static List<String> scanPage(String page) throws IOException {
        List<String> discovered = new ArrayList<>();
        Document document = Jsoup.connect(page).get();
        Element bodyElement = document.body();
        if (bodyElement == null) {
            return discovered;
        }
        String bodyHtml = bodyElement.html();
        Elements anchors = bodyElement.getElementsByTag("a");

        // Report each keyword once per page, not once per anchor.
        for (String errUrl : errUrls) {
            if (bodyHtml.contains(errUrl) || anyHrefContains(anchors, errUrl)) {
                System.out.println("page: " + page + "  ||  err key:" + errUrl);
            }
        }

        for (Element anchor : anchors) {
            String href = anchor.attr("href");
            if (href.startsWith("javascript")) {
                continue;
            }
            String target = bulidUrl(href, page);
            if (target == null) {
                continue;
            }
            // add() returns false when the URL was already seen: skip it
            if (!checked.add(target)) {
                continue;
            }
            // Skip friend links, external links and non-http schemes (mailto:, tel:, ...)
            if (!target.startsWith(baseUrl)) {
                continue;
            }
            discovered.add(target);
        }
        return discovered;
    }

    /** Returns whether any anchor's raw {@code href} attribute contains {@code key}. */
    private static boolean anyHrefContains(Elements anchors, String key) {
        for (Element anchor : anchors) {
            if (anchor.attr("href").contains(key)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Builds an absolute URL from a link found on a page.
     *
     * <p>Links starting with {@code http} are taken as already absolute.
     * Everything else is resolved against the page it appeared on, so
     * {@code c.html} on {@code http://host/a/b.html} becomes
     * {@code http://host/a/c.html}. Any {@code #fragment} is removed from the
     * result, because a fragment never identifies a different page.
     *
     * @param url  the raw {@code href} value
     * @param curr absolute URL of the page that contains the link
     * @return the absolute URL without fragment, or {@code null} when
     *         {@code url} is {@code null}/blank or cannot be resolved
     */
    public static String bulidUrl(String url, String curr) {
        if (url == null) {
            return null;
        }
        String href = url.trim();
        if (href.isEmpty()) {
            return null;
        }
        if (href.startsWith("http")) {
            return cutAt(href, '#');
        }
        if (curr == null) {
            return null;
        }
        if (href.startsWith("?")) {
            // Query-only link: same page with a different query string. Handled
            // by hand because java.net.URI.resolve follows RFC 2396 here and
            // would drop the last path segment.
            return cutAt(cutAt(cutAt(curr, '#'), '?') + href, '#');
        }
        try {
            URI base = URI.create(curr);
            String basePath = base.getPath();
            if (basePath == null || basePath.isEmpty()) {
                // "http://host" has an empty path; anchor it at the root so a
                // relative link is not glued directly onto the host name.
                base = base.resolve("/");
            }
            String resolved = base.resolve(href.replace(" ", "%20")).toString();
            return cutAt(resolved, '#');
        } catch (IllegalArgumentException e) {
            // Not a syntactically valid URI reference: nothing to crawl.
            return null;
        }
    }

    /** Returns {@code text} up to, but not including, the first {@code marker}. */
    private static String cutAt(String text, char marker) {
        int at = text.indexOf(marker);
        return at >= 0 ? text.substring(0, at) : text;
    }
}

发表评论

CAPTCHA is initializing...