
Crawling data from a specified website in Java

1. This class parses the site's content.
The key is the CSS selector: "div#page>div#content>div#local>div#recommend>ul>li>a";
Use Firefox's Firebug extension to inspect the page's HTML structure; the selector path differs from site to site.
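Before wiring a selector into the crawler, it can help to verify it in isolation. The snippet below is a minimal sketch for that purpose (the class name SelectorCheck is made up for illustration; the URL and selector are the ones used later in this article, and Jsoup.connect() with timeout() are standard Jsoup calls): it simply prints whatever the selector matches.

package zy.crawl.hupu;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

public class SelectorCheck
{
    public static void main(String[] args) throws Exception
    {
        // Fetch the page directly with Jsoup (assumption: no proxy is needed).
        Document doc = Jsoup.connect("http://qczx.qc1818.com/").timeout(20000).get();

        // The selector used in ParseHtmlForNewsList() below; adjust it for other sites.
        String cssQuery = "div#mainbody>div.cjkx_mtsd>div.cjkx>ul.list_left>li>a";

        for (Element ele : doc.select(cssQuery))
        {
            // Print the title text and the absolute link so the selector can be checked by eye.
            System.out.println(ele.text() + " " + ele.absUrl("href"));
        }
    }
}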


package zy.crawl.hupu;

import java.io.IOException;

import zy.crawl.common.*;

import java.util.ArrayList;
import java.util.List;

import org.apache.http.HttpEntity;
import org.apache.http.HttpHost;
import org.apache.http.HttpResponse;
import org.apache.http.HttpStatus;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.conn.params.ConnRoutePNames;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.params.CoreConnectionPNames;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

public class CrawlHupu
{
    private List<NewsInfo> newsList = new ArrayList<>(); // stores the crawled news items

    // Fetches the raw HTML of a URL; this is standard HttpClient boilerplate.
    public String GetHtml(String url)
    {
        String html = null;
        HttpClient httpClient = new DefaultHttpClient();
        // Set a proxy here if the crawler runs behind one.
        //HttpHost proxy = new HttpHost("10.68.120.11", 3128);
        //httpClient.getParams().setParameter(ConnRoutePNames.DEFAULT_PROXY, proxy);

        // Configure the connection timeout (milliseconds).
        httpClient.getParams().setParameter(CoreConnectionPNames.CONNECTION_TIMEOUT, 20000);

        HttpGet httpGet = new HttpGet(url);
        try
        {
            HttpResponse httpResponse = httpClient.execute(httpGet);
            int resStatus = httpResponse.getStatusLine().getStatusCode();
            if (resStatus == HttpStatus.SC_OK)
            {
                HttpEntity entity = httpResponse.getEntity();
                if (entity != null)
                {
                    html = EntityUtils.toString(entity);
                }
            }
        }
        catch (Exception e)
        {
            System.out.println("Connect " + url + " error");
            e.printStackTrace();
        }
        finally
        {
            httpClient.getConnectionManager().shutdown();
        }

        return html;
    }

    public void ParseHtmlForNewsList()
    {
        String html = GetHtml("http://qczx.qc1818.com/");

        // For the Hupu voice page the first class can be dropped for now, so the space in the class name need not be handled.
        //String cssQueryHupu = "div.content>div.row>div.column>div.row>div.column>div.uibox>div.uibox-con>ul.ui-list>li>a";
        String cssQueryHupu = "div#mainbody>div.cjkx_mtsd>div.cjkx>ul.list_left>li>a"; // selects the title link of each news item
        //String cssQueryHuxiu = "div.container-hx>div.row-fluid-wrap-hx>"
        //+ "div.center-container-hx>div.clearfix>div.center-ctr-wrap>div.center-ctr-box>div.article-list>div.article-box>div.article-box-ctt>h4>a";
        //
        //String cssQueryIteye = "div#page>div#content>div#local>div#recommend>ul>li>a";
        if (html != null && !html.isEmpty())
        {
            Document doc = Jsoup.parse(html, "http://qczx.qc1818.com/");
            Elements linkElements = doc.select(cssQueryHupu);
            /*
             * Note on class attributes that contain a space (e.g. a "click to read" button):
             * after testing, such a class can be written as two chained selects, e.g.
             * Elements indexEs = doc.select(".button").select(".read");
             * which successfully grabbed all chapter titles and links of the book.
             */

            //Elements linkElements = doc.select("div.hp-wrap").select("div.index-wrap>div.col-B>div.voice-main>div.public>div#J_public_item>ul>li>dl.item-bd>dt>span>a");
            for (Element ele : linkElements)
            {
                NewsInfo newsTemp = new NewsInfo(ele.text(), ele.absUrl("href"));

                ParseHtmlForNewsContent(newsTemp.getHtmlAddr(), newsTemp);
                newsList.add(newsTemp);
                //String href = ele.attr("abs:href"); // another way to get the absolute URL

                // for test
                System.out.println(newsTemp.getTitle() + " " + newsTemp.getHtmlAddr());
                if (newsTemp.getImageAddrList() != null)
                    System.out.println(newsTemp.getImageAddrList().get(0));
                System.out.println(newsTemp.getContent());
            }
            //System.out.println(newsList.get(0).getContent());
        }
    }

    // Follows the title link obtained above and scrapes the body of the article.
    public void ParseHtmlForNewsContent(String contentHtmlAddr, NewsInfo newsTemp)
    {
        String html = GetHtml(contentHtmlAddr);
        String cssQueryPhoto = "asdfas"; // placeholder; replace with the real image selector for the target site
        String cssQueryContent = //"div#pageMain>div.pageMainLeft>div.detailWrap>div.detailTitle"+
                //+"div#pageMain>div.pageMainLeft>div.detailWrap>div.detailIntr"
                "div#pageMain>div.pageMainLeft>div.detailWrap>div.detail";
        //String cssQueryContent = "div.content>div.row>div.column>div#articlewrap.area";
        //String cssQueryPhoto = "div.hp-wrap>div.voice-main>div.voice-item>ul>li>div.voice-read-detailed>div.voice-photoVideo>"
        //+ "div.voice-photo>div.small-img>img";
        if (html != null && !html.isEmpty())
        {
            Document doc = Jsoup.parse(html);
            Elements contentElements = doc.select(cssQueryContent);
            Elements imgElements = doc.select(cssQueryPhoto);
            for (Element ele : contentElements)
            {
                newsTemp.setContent(ele.html());
            }
            for (Element ele : imgElements)
            {
                List<String> tempImgList = new ArrayList<>();
                tempImgList.add(ele.attr("src"));
                newsTemp.setImageAddrList(tempImgList);
            }
        }
    }

    public static void main(String[] args)
    {
        CrawlHupu crawlHupu = new CrawlHupu();
        crawlHupu.ParseHtmlForNewsList();
    }

}
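One caveat: DefaultHttpClient, CoreConnectionPNames and ConnRoutePNames used in GetHtml are deprecated since HttpClient 4.3. If a newer 4.x HttpClient is on the classpath, a rough equivalent of GetHtml might look like the sketch below. This is an alternative written for illustration only (the class name HttpFetcher is made up), not part of the original class, and assumes HttpClient 4.3+.

import java.io.IOException;

import org.apache.http.HttpStatus;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;

public class HttpFetcher
{
    public static String getHtml(String url)
    {
        // Connection and socket timeouts in milliseconds (same 20 s as the original).
        RequestConfig config = RequestConfig.custom()
                .setConnectTimeout(20000)
                .setSocketTimeout(20000)
                .build();

        HttpGet httpGet = new HttpGet(url);
        httpGet.setConfig(config);

        // try-with-resources closes both the client and the response automatically.
        try (CloseableHttpClient httpClient = HttpClients.createDefault();
             CloseableHttpResponse response = httpClient.execute(httpGet))
        {
            if (response.getStatusLine().getStatusCode() == HttpStatus.SC_OK
                    && response.getEntity() != null)
            {
                return EntityUtils.toString(response.getEntity());
            }
        }
        catch (IOException e)
        {
            System.out.println("Connect " + url + " error");
            e.printStackTrace();
        }
        return null;
    }
}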





2. This is the data class that holds the crawled information; it needs little explanation.


package zy.crawl.common;

import java.util.List;

public class NewsInfo
{
    private String title;
    private String htmlAddr;
    private String content;
    private List<String> imageAddrList;


    public NewsInfo(String title, String htmlAddr)
    {
        this.title = title;
        this.htmlAddr = htmlAddr;
    }


    public NewsInfo(String content, List<String> imageAddrList)
    {
        this.content = content;
        this.imageAddrList = imageAddrList;
    }


    public String getTitle()
    {
        return title;
    }
    public void setTitle(String title)
    {
        this.title = title;
    }
    public String getHtmlAddr()
    {
        return htmlAddr;
    }
    public void setHtmlAddr(String htmlAddr)
    {
        this.htmlAddr = htmlAddr;
    }
    public String getContent()
    {
        return content;
    }
    public void setContent(String content)
    {
        this.content = content;
    }
    public List<String> getImageAddrList()
    {
        return imageAddrList;
    }
    public void setImageAddrList(List<String> imageAddrList)
    {
        this.imageAddrList = imageAddrList;
    }

}

