Java使用正则表达式爬取网站全部链接

package Regex;

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Minimal web-spider demo: downloads a page, then extracts and prints every
 * substring captured by a link-matching regular expression.
 */
public class WebSpiderTest {

    /**
     * Downloads the document at {@code urlStr} and returns its text.
     *
     * <p>The content is read line by line and the lines are concatenated
     * without their line terminators. This method is best-effort: if the URL
     * is malformed, the charset name is invalid or unsupported, or an I/O
     * error occurs, the failure is reported on {@code System.err} and whatever
     * was read so far (possibly the empty string) is returned.
     *
     * @param urlStr  address of the document to download
     * @param charset name of the charset used to decode the response bytes
     * @return the decoded content with line terminators removed; never {@code null}
     */
    public static String getURLContent(String urlStr, String charset) {
        StringBuilder sb = new StringBuilder();
        try {
            // Resolve the charset before opening the stream so that a bad
            // charset name cannot leave an already-opened connection unclosed.
            Charset decoder = Charset.forName(charset);
            URL url = new URL(urlStr);
            // try-with-resources closes the reader (and the underlying stream)
            // on both the normal and the exceptional path.
            try (BufferedReader reader =
                    new BufferedReader(new InputStreamReader(url.openStream(), decoder))) {
                String line;
                while ((line = reader.readLine()) != null) {
                    sb.append(line);
                }
            }
        } catch (Exception e) {
            // Deliberately best-effort: callers get the partial content
            // instead of an exception.
            System.err.println("Failed to read " + urlStr + ": " + e);
        }
        return sb.toString();
    }

    /**
     * Returns the text captured by group 1 of {@code regexStr} for every match
     * found in {@code destStr}, in order of appearance.
     *
     * @param destStr  text to search
     * @param regexStr regular expression; it must declare at least one
     *                 capturing group, otherwise {@code Matcher.group(1)}
     *                 throws {@link IndexOutOfBoundsException} on the first match
     * @return a new mutable list with one entry per match; empty when nothing matches
     */
    public static List<String> getMatherSubstrs(String destStr, String regexStr) {
        Pattern p = Pattern.compile(regexStr);
        Matcher m = p.matcher(destStr);
        List<String> result = new ArrayList<>();
        while (m.find()) {
            result.add(m.group(1));
        }
        return result;
    }

    /**
     * Fetches the 163.com front page and prints every double-quoted
     * {@code href} value made up of word characters, whitespace, '.', '/' and ':'.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        String destStr = getURLContent("https://www.163.com/", "gbk");
        // Group 1 captures the address between the quotes of href="...".
        List<String> result = getMatherSubstrs(destStr, "href=\"([\\w\\s./:]+?)\"");
        for (String temp : result) {
            System.out.println(temp);
        }
    }
}

【Java使用正则表达式爬取网站全部链接】

    推荐阅读