java爬虫代码怎么实现 java实现爬虫技术( 二 )


Java操作Excel需要用到jxl
5、关键代码实现
public ListString getJobUrls(String gj,String city,String kd){
String pre_url="";
String end_url=".html";
String url;
if (gj.equals("")){
url=";city="+city+"needAddtionalResult=falsefirst=falsepn="+pn+"kd="+kd;
}else {
url=""+gj+"px=defaultcity="+city+"needAddtionalResult=falsefirst=falsepn="+pn+"kd="+kd;
}
String rs=getJson(url);
System.out.println(rs);
int total= JsonPath.read(rs,"$.content.positionResult.totalCount");//获取总数
int pagesize=total/15;
if (pagesize=30){
pagesize=30;
}
System.out.println(total);
// System.out.println(rs);
ListInteger posid=JsonPath.read(rs,"$.content.positionResult.result[*].positionId");//获取网页id
for (int j=1;j=pagesize;j++){//获取所有的网页id
pn++;//更新页数
url=""+gj+"px=defaultcity="+city+"needAddtionalResult=falsefirst=falsepn="+pn+"kd="+kd;
String rs2=getJson(url);
ListInteger posid2=JsonPath.read(rs2,"$.content.positionResult.result[*].positionId");
posid.addAll(posid2); //添加解析的id到第一个list
}
ListString joburls=new ArrayList();
//生成网页列表
for (int id:posid){
String url3=pre_url+id+end_url;
joburls.add(url3);
}
return joburls;
}
public Job getJob(String url){//获取工作信息
Job job=new Job();
Document document= null;
document = Jsoup.parse(getJson(url));
job.setJobname(document.select(".name").text());
job.setSalary(document.select(".salary").text());
String joball=HtmlTool.tag(document.select(".job_bt").select("div").html());//清除html标签
job.setJobdesc(joball);//职位描述包含要求
job.setCompany(document.select(".b2").attr("alt"));
Elements elements=document.select(".c_feature");
//System.out.println(document.select(".name").text());
job.setCompanysite(elements.select("a").attr("href")); //获取公司主页
job.setJobdsite(url);
return job;
}
void insertExcel(ListJob jobList) throws IOException, BiffException, WriteException {
int row=1;
Workbook wb = Workbook.getWorkbook(new File(JobCondition.filename));
WritableWorkbook book = Workbook.createWorkbook(new File(JobCondition.filename), wb);
WritableSheet sheet=book.getSheet(0);
for (int i=0;ijobList.size();i++){//遍历工作列表,一行行插入到表格中
sheet.addCell(new Label(0,row,jobList.get(i).getJobname()));
sheet.addCell(new Label(1,row,jobList.get(i).getSalary()));
sheet.addCell(new Label(2,row,jobList.get(i).getJobdesc()));
sheet.addCell(new Label(3,row,jobList.get(i).getCompany()));
sheet.addCell(new Label(4,row,jobList.get(i).getCompanysite()));
sheet.addCell(new Label(5,row,jobList.get(i).getJobdsite()));
row++;
}
book.write();
book.close();
}
如何用Java写一个爬虫我主要使用Jsoup解析,获取源码有时候使用Jsoup , 比较复杂的时候比如需要换ip,改编码或者模拟登陆的时候使用HttpClient,以下是抓取开源中国新闻的一段代码,可以运行 。
package demo;
import java.io.IOException;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
*
* 使用JSoup 解析网页,语法使用 JS,css , Jquery 选择器语法,方便易懂
*
* Jsoup教程网:jsoup开发指南,jsoup中文使用手册,jsoup中文文档
*
* @author geekfly
*
*/
public class JsoupDemo {
public static void main(String[] args) throws IOException {
String url = "新闻资讯 - 开源中国社区";
Document document = Jsoup.connect(url).userAgent("Mozilla/5.0 (Windows NT 6.1; rv:30.0) Gecko/20100101 Firefox/30.0").get();

推荐阅读