爬取js渲染过的页面(爬取一个婚庆网站为例)


# !/usr/bin/python
# -*- coding: utf-8 -*-
"""Scrape member profiles from a JS-rendered matchmaking site (jiayuan.com).

The search listing is rendered by JavaScript, so instead of driving a
headless browser (e.g. PhantomJS) we replay the POST request the page's
own XHR sends (found via the browser's Network tab) and parse the JSON
it returns, then fetch and parse each member's profile page with lxml.
"""
import json
import urllib.parse
import urllib.request

try:
    from lxml import etree
except ImportError:  # lxml is only needed when profile pages are actually parsed
    etree = None


def extract_uids(obj):
    """Recursively collect every value stored under a ``"uid"`` key.

    Stdlib replacement for ``jsonpath.jsonpath(content, "$..uid")``:
    walks an arbitrary decoded-JSON structure (nested dicts/lists) and
    returns the ``uid`` values in document order.
    """
    uids = []
    if isinstance(obj, dict):
        for key, value in obj.items():
            if key == "uid":
                uids.append(value)
            # Keep descending: "$..uid" matches at any depth.
            uids.extend(extract_uids(value))
    elif isinstance(obj, list):
        for item in obj:
            uids.extend(extract_uids(item))
    return uids


class we(object):
    """Crawler that walks search result pages 3..5 and prints profiles."""

    def __init__(self):
        # First listing page to fetch; meiyiye() advances it by one per call.
        self.page = 3
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                          "AppleWebKit/537.36 (KHTML, like Gecko) "
                          "Chrome/60.0.3112.113 Safari/537.36",
        }

    def meiyiye(self):
        """Fetch one page of search results and process every member on it.

        Posts the form data the site's own JS sends, strips the
        ``##jiayser##...//`` padding around the JSON reply, extracts the
        member ids, and hands them to :meth:`meiyigeren`.
        """
        url = "http://search.jiayuan.com/v2/search_v2.php"
        form = {
            "sex": "f",
            "key": "",
            "stc": "1%3A41%2C2%3A19.27%2C3%3A155.170%2C23%3A1",
            "sn": "default",
            "sv": "1",
            "p": self.page,  # which results page to fetch
            "f": "",
            "listStyle": "bigPhoto",
            "pri_uid": "170703614",
            "jsversion": "v5",
        }
        # A POST request needs a body; a GET request would omit ``data``.
        data = urllib.parse.urlencode(form).encode("utf-8")
        request = urllib.request.Request(url, data=data, headers=self.headers)
        response = urllib.request.urlopen(request)
        # The reply is a JSON string wrapped in "##jiayser##" / "//" padding.
        html = response.read().decode("utf-8", errors="replace")
        cleaned = html.replace("##jiayser##", "").replace("//", "")
        content = json.loads(cleaned)
        # Collect the personal ids; each one is turned into a profile URL later.
        item = extract_uids(content)
        self.page += 1
        self.meiyigeren(item)  # process every member found on this page

    def meiyigeren(self, item):
        """Fetch and print profile details for each user id in *item*.

        For every id: build the profile URL, download the page, and use
        XPath to pull out name, age, avatar, photo album, monologue,
        home province and education, printing each field.
        """
        for uid in item:
            print("*******************************************")
            print(u"用户id:" + str(uid))
            # Build the member's public profile URL from the uid.
            url = "http://www.jiayuan.com/" + str(uid) + "?fxly=search_v2_index"
            print(u"主页链接:" + url)
            request = urllib.request.Request(url, headers=self.headers)
            response = urllib.request.urlopen(request)
            html = response.read()
            # Parse the HTML into a DOM so XPath queries can be run on it.
            content = etree.HTML(html)
            username = content.xpath(
                '//div[@class="main_1000 bg_white mt15"]//h4/text()')
            if len(username) == 1:
                print(username[0])
            else:
                print(u"没有名字")
            fragments = content.xpath(
                '//div[@class="main_1000 bg_white mt15"]'
                '//h6[@class="member_name"]/text()')
            joined = " ".join(fragments)
            # Normalise both fullwidth and ASCII commas to spaces before splitting.
            fields = joined.replace(",", " ").replace(',', ' ').split(" ")
            print(u"年龄:" + fields[0])
            header_url = content.xpath(
                '//div[@class="big_pic fn-clear"]//li[2]//tr'
                '//img[@class="img_absolute"]//@_src')
            if len(header_url) == 1:
                avatar = header_url[0]
            else:
                avatar = u"没有头像链接:"
            print(u"头像链接:" + avatar)
            image_url = content.xpath(
                '//div[@class="small_pic_box fn-clear"]'
                '//div[@class="small_pic fn-clear"]//li//img//@src')
            print(u"相册链接:", end="")
            print(image_url)
            monologue = "".join(content.xpath(
                '//div[@class="main_1000 mt15 fn-clear"]'
                '//div[@class="bg_white"]//div[@class="js_text"]//text()'))
            print(u"内心独白:" + monologue.strip())
            place = content.xpath(
                '//div[@class="main_1000 bg_white mt15"]'
                '//h6[@class="member_name"]/a[2]/text()')
            if len(place) == 1:
                where = place[0]
            else:
                where = u"河南"  # fallback province when the page omits it
            print(u"来自:" + where + u"省")
            xueli = content.xpath(
                '//div[@class="main_1000 bg_white mt15"]'
                '//ul[@class="member_info_list fn-clear"]/li[1]'
                '//div[@class="fl pr"]/em/text()')
            if len(xueli) == 1:
                print(u"学历:" + xueli[0])
            else:
                print(u"学历:本科")  # fallback when education is not listed
            print("***********************************************")
        # Keep crawling until page 5 has been fetched (page was already advanced).
        if self.page <= 5:
            self.meiyiye()


if __name__ == "__main__":
    ni = we()
    ni.meiyiye()

    推荐阅读