
Crawling data from a specified website in Java

1. This class parses the site's content.
The key is the CSS selector: "div#page>div#content>div#local>div#recommend>ul>li>a";
Use Firefox's Firebug extension to inspect the page's HTML structure; the selector path differs from site to site.
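Before wiring a selector into the crawler, it can help to verify it in isolation. The snippet below is a minimal sketch for that purpose (the class name SelectorCheck is made up for illustration; the URL and selector are the ones used later in this article, and Jsoup.connect() with timeout() are standard Jsoup calls): it simply prints whatever the selector matches.

package zy.crawl.hupu;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

public class SelectorCheck
{
    public static void main(String[] args) throws Exception
    {
        // Fetch the page directly with Jsoup (assumption: no proxy is needed).
        Document doc = Jsoup.connect("http://qczx.qc1818.com/").timeout(20000).get();

        // The selector used in ParseHtmlForNewsList() below; adjust it for other sites.
        String cssQuery = "div#mainbody>div.cjkx_mtsd>div.cjkx>ul.list_left>li>a";

        for (Element ele : doc.select(cssQuery))
        {
            // Print the title text and the absolute link so the selector can be checked by eye.
            System.out.println(ele.text() + " " + ele.absUrl("href"));
        }
    }
}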


package zy.crawl.hupu;

import java.io.IOException;

import zy.crawl.common.*;

import java.util.ArrayList;
import java.util.List;

import org.apache.http.HttpEntity;
import org.apache.http.HttpHost;
import org.apache.http.HttpResponse;
import org.apache.http.HttpStatus;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.conn.params.ConnRoutePNames;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.params.CoreConnectionPNames;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

public class CrawlHupu
{
    private List<NewsInfo> newsList = new ArrayList<>(); // stores the crawled news items

    // Fetches the raw HTML of a URL; this is standard HttpClient boilerplate.
    public String GetHtml(String url)
    {
        String html = null;
        HttpClient httpClient = new DefaultHttpClient();
        // Set a proxy here if the crawler runs behind one.
        //HttpHost proxy = new HttpHost("10.68.120.11", 3128);
        //httpClient.getParams().setParameter(ConnRoutePNames.DEFAULT_PROXY, proxy);

        // Configure the connection timeout (milliseconds).
        httpClient.getParams().setParameter(CoreConnectionPNames.CONNECTION_TIMEOUT, 20000);

        HttpGet httpGet = new HttpGet(url);
        try
        {
            HttpResponse httpResponse = httpClient.execute(httpGet);
            int resStatus = httpResponse.getStatusLine().getStatusCode();
            if (resStatus == HttpStatus.SC_OK)
            {
                HttpEntity entity = httpResponse.getEntity();
                if (entity != null)
                {
                    html = EntityUtils.toString(entity);
                }
            }
        }
        catch (Exception e)
        {
            System.out.println("Connect " + url + " error");
            e.printStackTrace();
        }
        finally
        {
            httpClient.getConnectionManager().shutdown();
        }

        return html;
    }

    public void ParseHtmlForNewsList()
    {
        String html = GetHtml("http://qczx.qc1818.com/");

        // For the Hupu voice page the first class can be dropped for now, so the space in the class name need not be handled.
        //String cssQueryHupu = "div.content>div.row>div.column>div.row>div.column>div.uibox>div.uibox-con>ul.ui-list>li>a";
        String cssQueryHupu = "div#mainbody>div.cjkx_mtsd>div.cjkx>ul.list_left>li>a"; // selects the title link of each news item
        //String cssQueryHuxiu = "div.container-hx>div.row-fluid-wrap-hx>"
        //+ "div.center-container-hx>div.clearfix>div.center-ctr-wrap>div.center-ctr-box>div.article-list>div.article-box>div.article-box-ctt>h4>a";
        //
        //String cssQueryIteye = "div#page>div#content>div#local>div#recommend>ul>li>a";
        if (html != null && !html.isEmpty())
        {
            Document doc = Jsoup.parse(html, "http://qczx.qc1818.com/");
            Elements linkElements = doc.select(cssQueryHupu);
            /*
             * Note on class attributes that contain a space (e.g. a "click to read" button):
             * after testing, such a class can be written as two chained selects, e.g.
             * Elements indexEs = doc.select(".button").select(".read");
             * which successfully grabbed all chapter titles and links of the book.
             */

            //Elements linkElements = doc.select("div.hp-wrap").select("div.index-wrap>div.col-B>div.voice-main>div.public>div#J_public_item>ul>li>dl.item-bd>dt>span>a");
            for (Element ele : linkElements)
            {
                NewsInfo newsTemp = new NewsInfo(ele.text(), ele.absUrl("href"));

                ParseHtmlForNewsContent(newsTemp.getHtmlAddr(), newsTemp);
                newsList.add(newsTemp);
                //String href = ele.attr("abs:href"); // another way to get the absolute URL

                // for test
                System.out.println(newsTemp.getTitle() + " " + newsTemp.getHtmlAddr());
                if (newsTemp.getImageAddrList() != null)
                    System.out.println(newsTemp.getImageAddrList().get(0));
                System.out.println(newsTemp.getContent());
            }
            //System.out.println(newsList.get(0).getContent());
        }
    }

    // Follows the title link obtained above and scrapes the body of the article.
    public void ParseHtmlForNewsContent(String contentHtmlAddr, NewsInfo newsTemp)
    {
        String html = GetHtml(contentHtmlAddr);
        String cssQueryPhoto = "asdfas"; // placeholder; replace with the real image selector for the target site
        String cssQueryContent = //"div#pageMain>div.pageMainLeft>div.detailWrap>div.detailTitle"+
                //+"div#pageMain>div.pageMainLeft>div.detailWrap>div.detailIntr"
                "div#pageMain>div.pageMainLeft>div.detailWrap>div.detail";
        //String cssQueryContent = "div.content>div.row>div.column>div#articlewrap.area";
        //String cssQueryPhoto = "div.hp-wrap>div.voice-main>div.voice-item>ul>li>div.voice-read-detailed>div.voice-photoVideo>"
        //+ "div.voice-photo>div.small-img>img";
        if (html != null && !html.isEmpty())
        {
            Document doc = Jsoup.parse(html);
            Elements contentElements = doc.select(cssQueryContent);
            Elements imgElements = doc.select(cssQueryPhoto);
            for (Element ele : contentElements)
            {
                newsTemp.setContent(ele.html());
            }
            for (Element ele : imgElements)
            {
                List<String> tempImgList = new ArrayList<>();
                tempImgList.add(ele.attr("src"));
                newsTemp.setImageAddrList(tempImgList);
            }
        }
    }

    public static void main(String[] args)
    {
        CrawlHupu crawlHupu = new CrawlHupu();
        crawlHupu.ParseHtmlForNewsList();
    }

}
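One caveat: DefaultHttpClient, CoreConnectionPNames and ConnRoutePNames used in GetHtml are deprecated since HttpClient 4.3. If a newer 4.x HttpClient is on the classpath, a rough equivalent of GetHtml might look like the sketch below. This is an alternative written for illustration only (the class name HttpFetcher is made up), not part of the original class, and assumes HttpClient 4.3+.

import java.io.IOException;

import org.apache.http.HttpStatus;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;

public class HttpFetcher
{
    public static String getHtml(String url)
    {
        // Connection and socket timeouts in milliseconds (same 20 s as the original).
        RequestConfig config = RequestConfig.custom()
                .setConnectTimeout(20000)
                .setSocketTimeout(20000)
                .build();

        HttpGet httpGet = new HttpGet(url);
        httpGet.setConfig(config);

        // try-with-resources closes both the client and the response automatically.
        try (CloseableHttpClient httpClient = HttpClients.createDefault();
             CloseableHttpResponse response = httpClient.execute(httpGet))
        {
            if (response.getStatusLine().getStatusCode() == HttpStatus.SC_OK
                    && response.getEntity() != null)
            {
                return EntityUtils.toString(response.getEntity());
            }
        }
        catch (IOException e)
        {
            System.out.println("Connect " + url + " error");
            e.printStackTrace();
        }
        return null;
    }
}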





2. This is the data class that holds the crawled information; it needs little explanation.


package zy.crawl.common;

import java.util.List;

public class NewsInfo
{
    private String title;
    private String htmlAddr;
    private String content;
    private List<String> imageAddrList;


    public NewsInfo(String title, String htmlAddr)
    {
        this.title = title;
        this.htmlAddr = htmlAddr;
    }


    public NewsInfo(String content, List<String> imageAddrList)
    {
        this.content = content;
        this.imageAddrList = imageAddrList;
    }


    public String getTitle()
    {
        return title;
    }
    public void setTitle(String title)
    {
        this.title = title;
    }
    public String getHtmlAddr()
    {
        return htmlAddr;
    }
    public void setHtmlAddr(String htmlAddr)
    {
        this.htmlAddr = htmlAddr;
    }
    public String getContent()
    {
        return content;
    }
    public void setContent(String content)
    {
        this.content = content;
    }
    public List<String> getImageAddrList()
    {
        return imageAddrList;
    }
    public void setImageAddrList(List<String> imageAddrList)
    {
        this.imageAddrList = imageAddrList;
    }

}

