java写爬虫代码 java爬虫入门教程

用java编写网络爬虫,求代码和流程(急)
import java.awt.*;
import java.awt.event.*;
import java.io.*;
import java.net.*;
import java.util.*;
import java.util.regex.*;
import javax.swing.*;
import javax.swing.table.*;//一个Web的爬行者(注:爬行在这里的意思与抓取、捕获相同)
// Swing window (JFrame) for a web search crawler.
// NOTE(review): this listing stops inside the constructor, right after the menu bar is
// installed; the rest of the constructor and the class (including actionExit()) is not shown here.
public class SearchCrawler extends JFrame{
// Selectable values for the maximum number of URLs
private static final String[] MAX_URLS={"50","100","500","1000"};
// Cache of robot disallow lists
private HashMap disallowListCache=new HashMap();
// Search GUI controls
private JTextField startTextField;
private JComboBox maxComboBox;
private JCheckBox limitCheckBox;
private JTextField logTextField;
private JTextField searchTextField;
private JCheckBox caseCheckBox;
private JButton searchButton;
// Search-status GUI controls
private JLabel crawlingLabel2;
private JLabel crawledLabel2;
private JLabel toCrawlLabel2;
private JProgressBar progressBar;
private JLabel matchesLabel2;
// Table listing the search matches
private JTable table;
// Flag marking whether the crawler is currently crawling
private boolean crawling;
// Writer used for the match log file
private PrintWriter logFileWriter;
// Constructor of the search crawler window
public SearchCrawler(){
// Set the application title bar text
setTitle("搜索爬行者");
// Set the window size
setSize(600,600);
// Handle the window-closing event by delegating to actionExit()
addWindowListener(new WindowAdapter(){
public void windowClosing(WindowEvent e){
actionExit();
}
});
// Build the File menu (mnemonic F) with a single Exit item (mnemonic X)
JMenuBar menuBar=new JMenuBar();
JMenu fileMenu=new JMenu("文件");
fileMenu.setMnemonic(KeyEvent.VK_F);
JMenuItem fileExitMenuItem=new JMenuItem("退出",KeyEvent.VK_X);
// The Exit item takes the same path as closing the window
fileExitMenuItem.addActionListener(new ActionListener(){
public void actionPerformed(ActionEvent e){
actionExit();
}
});
fileMenu.add(fileExitMenuItem);
menuBar.add(fileMenu);
setJMenuBar(menuBar);
如何用Java写一个爬虫
import java.io.File;
import java.io.InputStream;
import java.net.URL;
import java.net.URLConnection;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.Scanner;
import java.util.UUID;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Downloads every image referenced by an {@code img src="..."} attribute on a run of
 * numbered listing pages (base URL + page number) and saves each image under a random
 * {@code .jpg} file name in the output directory.
 */
public class DownMM {
    /** Matches {@code img src="..."}; group 1 is the quoted attribute value. */
    static final Pattern IMG_SRC = Pattern.compile("img src=\"(.*?)\"");

    /**
     * Fetches pages 1 through 10 and downloads every image they reference.
     *
     * @param args unused
     * @throws Exception if a page or an image cannot be fetched or written
     */
    public static void main(String[] args) throws Exception {
        // Output directory; it must end with a separator because file names are appended directly.
        String out = "D:\\JSP\\pic\\java\\";
        try {
            Files.createDirectories(Paths.get(out));
        } catch (Exception e) {
            // Best effort, as before: report and carry on; the first copy will fail loudly if
            // the directory really is unusable.
            System.out.println("no");
        }

        // NOTE(review): the base URL was lost when this listing was published; "-" is the
        // placeholder that was left behind and must be replaced with the real listing URL.
        String url = "-";
        int downloaded = 0;
        for (int page = 1; page <= 10; page++) {
            URLConnection conn = new URL(url + page).openConnection();
            conn.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko");

            // "\\A" as delimiter makes the scanner return the whole response as one token.
            String html;
            try (Scanner sc = new Scanner(conn.getInputStream())) {
                sc.useDelimiter("\\A");
                html = sc.hasNext() ? sc.next() : ""; // an empty body yields no token
            }

            Matcher m = IMG_SRC.matcher(html);
            while (m.find()) {
                try (InputStream in = new URL(m.group(1)).openStream()) {
                    Files.copy(in, Paths.get(out + UUID.randomUUID() + ".jpg"));
                }
                System.out.println("已下载:" + downloaded++);
            }
        }
    }
}
Java网络爬虫怎么实现?网络爬虫是一个自动提取网页的程序,它为搜索引擎从万维网上下载网页,是搜索引擎的重要组成 。\x0d\x0a传统爬虫从一个或若干初始网页的URL开始,获得初始网页上的URL,在抓取网页的过程中,不断从当前页面上抽取新的URL放入队列,直到满足系统的一定停止条件 。对于垂直搜索来说,聚焦爬虫,即有针对性地爬取特定主题网页的爬虫,更为适合 。\x0d\x0a\x0d\x0a以下是一个使用java实现的简单爬虫核心代码:\x0d\x0apublic void crawl() throws Throwable {\x0d\x0awhile (continueCrawling()) {\x0d\x0aCrawlerUrl url = getNextUrl(); //获取待爬取队列中的下一个URL\x0d\x0aif (url != null) {\x0d\x0aprintCrawlInfo();\x0d\x0aString content = getContent(url); //获取URL的文本信息\x0d\x0a\x0d\x0a//聚焦爬虫只爬取与主题内容相关的网页,这里采用正则匹配简单处理\x0d\x0aif (isContentRelevant(content, this.regexpSearchPattern)) {\x0d\x0asaveContent(url, content); //保存网页至本地\x0d\x0a\x0d\x0a//获取网页内容中的链接 , 并放入待爬取队列中\x0d\x0aCollection urlStrings = extractUrls(content, url);\x0d\x0aaddUrlsToUrlQueue(url, urlStrings);\x0d\x0a} else {\x0d\x0aSystem.out.println(url + " is not relevant ignoring ...");\x0d\x0a}\x0d\x0a\x0d\x0a//延时防止被对方屏蔽\x0d\x0aThread.sleep(this.delayBetweenUrls);\x0d\x0a}\x0d\x0a}\x0d\x0acloseOutputStream();\x0d\x0a}\x0d\x0aprivate CrawlerUrl getNextUrl() throws Throwable {\x0d\x0aCrawlerUrl nextUrl = null;\x0d\x0awhile ((nextUrl == null)(!urlQueue.isEmpty())) {\x0d\x0aCrawlerUrl crawlerUrl = this.urlQueue.remove();\x0d\x0a//doWeHavePermissionToVisit:是否有权限访问该URL,友好的爬虫会根据网站提供的"Robot.txt"中配置的规则进行爬取\x0d\x0a//isUrlAlreadyVisited:URL是否访问过 , 大型的搜索引擎往往采用BloomFilter进行排重,这里简单使用HashMap\x0d\x0a//isDepthAcceptable:是否达到指定的深度上限 。爬虫一般采取广度优先的方式 。一些网站会构建爬虫陷阱(自动生成一些无效链接使爬虫陷入死循环),采用深度限制加以避免\x0d\x0aif (doWeHavePermissionToVisit(crawlerUrl)\x0d\x0a(!isUrlAlreadyVisited(crawlerUrl))\x0d\x0aisDepthAcceptable(crawlerUrl)) {\x0d\x0anextUrl = crawlerUrl;\x0d\x0a// System.out.println("Next url to be visited is " + nextUrl);\x0d\x0a}\x0d\x0a}\x0d\x0areturn nextUrl;\x0d\x0a}\x0d\x0aprivate String getContent(CrawlerUrl url) throws Throwable {\x0d\x0a//HttpClient4.1的调用与之前的方式不同\x0d\x0aHttpClient client = new DefaultHttpClient();\x0d\x0aHttpGet httpGet = new HttpGet(url.getUrlString());\x0d\x0aStringBuffer strBuf = new 
StringBuffer();\x0d\x0aHttpResponse response = client.execute(httpGet);\x0d\x0aif (HttpStatus.SC_OK == response.getStatusLine().getStatusCode()) {\x0d\x0aHttpEntity entity = response.getEntity();\x0d\x0aif (entity != null) {\x0d\x0aBufferedReader reader = new BufferedReader(\x0d\x0anew InputStreamReader(entity.getContent(), "UTF-8"));\x0d\x0aString line = null;\x0d\x0aif (entity.getContentLength()0) {\x0d\x0astrBuf = new StringBuffer((int) entity.getContentLength());\x0d\x0awhile ((line = reader.readLine()) != null) {\x0d\x0astrBuf.append(line);\x0d\x0a}\x0d\x0a}\x0d\x0a}\x0d\x0aif (entity != null) {\x0d\x0ansumeContent();\x0d\x0a}\x0d\x0a}\x0d\x0a//将url标记为已访问\x0d\x0amarkUrlAsVisited(url);\x0d\x0areturn strBuf.toString();\x0d\x0a}\x0d\x0apublic static boolean isContentRelevant(String content,\x0d\x0aPattern regexpPattern) {\x0d\x0aboolean retValue = https://www.04ip.com/post/false;/x0d/x0aif (content != null) {/x0d/x0a//是否符合正则表达式的条件/x0d/x0aMatcher m = regexpPattern.matcher(content.toLowerCase());/x0d/x0aretValue = m.find();/x0d/x0a}/x0d/x0areturn retValue;/x0d/x0a}/x0d/x0apublic List extractUrls(String text, CrawlerUrl crawlerUrl) {/x0d/x0aMap urlMap = new HashMap();/x0d/x0aextractHttpUrls(urlMap, text);/x0d/x0aextractRelativeUrls(urlMap, text, crawlerUrl);/x0d/x0areturn new ArrayList(urlMap.keySet());/x0d/x0a}/x0d/x0aprivate void extractHttpUrls(Map urlMap, String text) {/x0d/x0aMatcher m = (text);/x0d/x0awhile (m.find()) {/x0d/x0aString url = m.group();/x0d/x0aString[] terms = url.split("a href=https://www.04ip.com/"");\x0d\x0afor (String term : terms) {\x0d\x0a// System.out.println("Term = " + term);\x0d\x0aif (term.startsWith("http")) {\x0d\x0aint index = term.indexOf("\"");\x0d\x0aif (index0) {\x0d\x0aterm = term.substring(0, index);\x0d\x0a}\x0d\x0aurlMap.put(term, term);\x0d\x0aSystem.out.println("Hyperlink: " + term);\x0d\x0a}\x0d\x0a}\x0d\x0a}\x0d\x0a}\x0d\x0aprivate void extractRelativeUrls(Map urlMap, String text,\x0d\x0aCrawlerUrl crawlerUrl) 
{\x0d\x0aMatcher m = relativeRegexp.matcher(text);\x0d\x0aURL textURL = crawlerUrl.getURL();\x0d\x0aString host = textURL.getHost();\x0d\x0awhile (m.find()) {\x0d\x0aString url = m.group();\x0d\x0aString[] terms = url.split("a href=https://www.04ip.com/"");\x0d\x0afor (String term : terms) {\x0d\x0aif (term.startsWith("/")) {\x0d\x0aint index = term.indexOf("\"");\x0d\x0aif (index0) {\x0d\x0aterm = term.substring(0, index);\x0d\x0a}\x0d\x0aString s = //" + host + term;\x0d\x0aurlMap.put(s, s);\x0d\x0aSystem.out.println("Relative url: " + s);\x0d\x0a}\x0d\x0a}\x0d\x0a}\x0d\x0a\x0d\x0a}\x0d\x0apublic static void main(String[] args) {\x0d\x0atry {\x0d\x0aString url = "";\x0d\x0aQueue urlQueue = new LinkedList();\x0d\x0aString regexp = "java";\x0d\x0aurlQueue.add(new CrawlerUrl(url, 0));\x0d\x0aNaiveCrawler crawler = new NaiveCrawler(urlQueue, 100, 5, 1000L,\x0d\x0aregexp);\x0d\x0a// boolean allowCrawl = crawler.areWeAllowedToVisit(url);\x0d\x0a// System.out.println("Allowed to crawl: " + url + " " +\x0d\x0a// allowCrawl);\x0d\x0acrawler.crawl();\x0d\x0a} catch (Throwable t) {\x0d\x0aSystem.out.println(t.toString());\x0d\x0at.printStackTrace();\x0d\x0a}\x0d\x0a}

推荐阅读