需求:网站中存在死链,需要找出所有出现死链的页面。
所需依赖(jar 包,Maven 坐标):org.jsoup:jsoup:1.13.1
package com.company; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; import java.io.IOException; import java.util.ArrayList; import java.util.List; public class Main { private Main mainobj; //死链关键字 private static final String[] errUrls = new String[]{ "/member", "feedback.php" }; private static final String baseUrl = "http://www.xxx.com"; private static Listchecked; private static String lastURL; public static void main(String[] args) { checked = new ArrayList<>(); check(baseUrl+"/"); } public static void check(String url) { try { Document document = Jsoup.connect(url).get(); Element bodyElement = document.body(); String ownText=bodyElement.html(); Elements aElements = bodyElement.getElementsByTag("a"); for (Element element : aElements) { String ahref = element.attr("href"); for (String errUrl:errUrls){ if (ahref.contains(errUrl)){ System.out.println("page: "+url+" || err key:"+errUrl); } if (ownText.contains(errUrl)){ System.out.println("page: "+url+" || err key:"+errUrl); } } if (ahref.startsWith("javascript"))continue; //System.out.println("ahref:" + ahref); String currUrl = bulidUrl(ahref, url); if (currUrl == null) continue; if (checked.contains(currUrl)) {//如果已经存在则跳过 continue; } checked.add(currUrl); //跳过友链外链 if (currUrl.startsWith("http")&&!currUrl.startsWith(baseUrl)){ continue; } lastURL=currUrl; check(currUrl); } } catch (IOException e) { } } /** * 拼合URL * * @param url * @param curr * @return */ public static String bulidUrl(String url, String curr) { if (url == null || url.equals("")) return null; if (url.startsWith("http")) { return url; } if (url.startsWith("/")) { return baseUrl + url; } return curr + url; } }