// Requirement: dead links were found on the website; find every page on which a dead link appears.
// Required jar: org.jsoup:jsoup:1.13.1
package com.company;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.ArrayDeque;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Deque;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Set;
import java.util.regex.Pattern;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
 * Crawls a website starting from {@code BASE_URL + "/"} and prints every page
 * whose HTML body or anchor {@code href} values contain one of the dead-link
 * keywords.
 *
 * <p>Only URLs located under {@code BASE_URL} are fetched; links to other hosts
 * are recorded as seen but never followed. The visited set is plain static
 * state, so the crawler is intended for single-threaded use.
 */
public class Main {
    /** Keywords that identify a dead link. */
    private static final String[] DEAD_LINK_KEYS = new String[]{
            "/member", "feedback.php"
    };

    /** Root of the site being crawled (no trailing slash). */
    private static final String BASE_URL = "http://www.xxx.com";

    /** Matches a leading URI scheme such as {@code http:}, {@code mailto:} or {@code javascript:}. */
    private static final Pattern SCHEME_PREFIX = Pattern.compile("^[A-Za-z][A-Za-z0-9+.-]*:");

    /**
     * Every URL already seen, followed or not. Initialised eagerly so that
     * {@link #check(String)} works even when it is not entered through {@code main}.
     */
    private static final Set<String> checked = new HashSet<>();

    /**
     * Entry point: resets the visited set and crawls the site from its root page.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        checked.clear();
        check(BASE_URL + "/");
    }

    /**
     * Crawls every not-yet-visited internal page reachable from {@code url} and
     * prints one {@code page: ... || err key:...} line per page and keyword found.
     *
     * <p>The traversal uses an explicit work stack instead of recursion, so a
     * deeply linked site cannot overflow the call stack. The order in which
     * pages are visited is unspecified.
     *
     * @param url absolute URL of the page to start from; {@code null} or empty is ignored
     */
    public static void check(String url) {
        if (url == null || url.isEmpty()) {
            return;
        }
        Deque<String> pending = new ArrayDeque<>();
        // Mark the start page as seen so a link back to it is not fetched again.
        checked.add(url);
        pending.push(url);
        while (!pending.isEmpty()) {
            String page = pending.pop();
            for (String link : scanPage(page)) {
                // Record every link (internal or not) as seen; follow internal ones only.
                if (checked.add(link) && isInternal(link)) {
                    pending.push(link);
                }
            }
        }
    }

    /**
     * Fetches one page, reports the dead-link keywords it contains and returns
     * the resolved targets of its anchors.
     *
     * @param pageUrl absolute URL of the page to fetch
     * @return resolved link targets in document order; empty when the page could not be loaded
     */
    private static List<String> scanPage(String pageUrl) {
        Document document;
        try {
            document = Jsoup.connect(pageUrl).get();
        } catch (IOException | IllegalArgumentException e) {
            // Best effort: an unreachable, non-HTML or malformed URL must not stop the crawl,
            // but it is reported instead of being silently dropped.
            System.err.println("skip: " + pageUrl + " (" + e + ")");
            return Collections.emptyList();
        }
        Element body = document.body();
        if (body == null) {
            return Collections.emptyList();
        }

        // Collect hits in a set so each keyword is reported at most once per page.
        Set<String> hits = new LinkedHashSet<>();
        String bodyHtml = body.html();
        for (String key : DEAD_LINK_KEYS) {
            if (bodyHtml.contains(key)) {
                hits.add(key);
            }
        }

        List<String> links = new ArrayList<>();
        Elements anchors = body.getElementsByTag("a");
        for (Element anchor : anchors) {
            String href = anchor.attr("href");
            for (String key : DEAD_LINK_KEYS) {
                if (href.contains(key)) {
                    hits.add(key);
                }
            }
            String target = bulidUrl(href, pageUrl);
            if (target != null) {
                links.add(target);
            }
        }

        for (String key : hits) {
            System.out.println("page: " + pageUrl + " || err key:" + key);
        }
        return links;
    }

    /**
     * Tells whether {@code link} lives under {@code BASE_URL}. A bare prefix
     * test is not used because it would also accept hosts such as
     * {@code http://www.xxx.com.other.org}.
     */
    private static boolean isInternal(String link) {
        return link.equals(BASE_URL)
                || link.startsWith(BASE_URL + "/")
                || link.startsWith(BASE_URL + "?");
    }

    /**
     * Builds the absolute URL that an {@code href} value refers to.
     *
     * <ul>
     *   <li>{@code http://} / {@code https://} URLs are returned as they are;</li>
     *   <li>protocol-relative links ({@code //host/path}) inherit the scheme of the base URL;</li>
     *   <li>root-relative links ({@code /path}) are appended to the base URL;</li>
     *   <li>any other link is resolved against the directory of {@code curr}.</li>
     * </ul>
     * A trailing {@code #fragment} is always removed, because it addresses a
     * position inside a page rather than a different page.
     *
     * @param url  the raw {@code href} value
     * @param curr absolute URL of the page on which the link was found
     * @return the absolute URL, or {@code null} when the link is empty, is a pure
     *         fragment, uses a non-HTTP scheme (such as {@code javascript:} or
     *         {@code mailto:}) or cannot be parsed
     */
    public static String bulidUrl(String url, String curr) {
        if (url == null) {
            return null;
        }
        String href = stripFragment(url.trim());
        if (href.isEmpty()) {
            return null;
        }
        if (SCHEME_PREFIX.matcher(href).find()) {
            return isHttpUrl(href) ? href : null;
        }
        if (href.startsWith("//")) {
            return BASE_URL.substring(0, BASE_URL.indexOf(':') + 1) + href;
        }
        if (href.startsWith("/")) {
            return BASE_URL + href;
        }
        return resolveRelative(href, curr);
    }

    /** Resolves a path-relative or query-only reference against the page it was found on. */
    private static String resolveRelative(String href, String curr) {
        if (curr == null || curr.isEmpty()) {
            return null;
        }
        String basePage = stripFragment(curr);
        if (href.startsWith("?")) {
            // A query-only reference keeps the current path; java.net.URI would drop its last segment.
            int query = basePage.indexOf('?');
            return (query >= 0 ? basePage.substring(0, query) : basePage) + href;
        }
        try {
            URI base = new URI(basePage);
            if (base.isOpaque()) {
                return null;
            }
            String path = base.getRawPath();
            if (path == null || path.isEmpty()) {
                // "http://host" has an empty path; anchor it at "/" so the result is not "http://hostpage".
                base = base.resolve("/");
            }
            return base.resolve(new URI(href)).toString();
        } catch (URISyntaxException e) {
            // Unparseable link text (for example containing spaces): nothing that can be fetched.
            return null;
        }
    }

    /** Returns {@code value} without its {@code #fragment} part, if it has one. */
    private static String stripFragment(String value) {
        int hash = value.indexOf('#');
        return hash >= 0 ? value.substring(0, hash) : value;
    }

    /** Tells whether {@code value} starts with {@code http://} or {@code https://}, ignoring case. */
    private static boolean isHttpUrl(String value) {
        return value.regionMatches(true, 0, "http://", 0, 7)
                || value.regionMatches(true, 0, "https://", 0, 8);
    }
}