BeautifulSoup

A small example: a spider that walks the novel listing on all.hengyan.com, pulls each book's detail URL from the list page, then parses the book id and ticket (vote) count from the detail page.

# pip3 install beautifulsoup4 lxml   (the 'lxml' parser used below needs the lxml package)
from bs4 import BeautifulSoup
import requests
import re

# List pages follow this pattern; the last number is the page number:
# http://all.hengyan.com/1/0_0_0_0_0_0_0_0_0_1.aspx
# http://all.hengyan.com/1/0_0_0_0_0_0_0_0_0_2.aspx
# http://all.hengyan.com/1/0_0_0_0_0_0_0_0_0_3.aspx
# http://all.hengyan.com/1/0_0_0_0_0_0_0_0_0_100.aspx


class HengYanSpider(object):

    def __init__(self):
        self.first_url = 'http://all.hengyan.com/1/0_0_0_0_0_0_0_0_0_1.aspx'
        self.default_headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) '
                          'AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/75.0.3770.100 Safari/537.36'
        }

    def get_noval_url(self, url=None):
        """Fetch a list page and extract the detail URL of every novel on it."""
        url = self.first_url if not url else url
        html = self.send_request(url)
        if html:
            # Parse the list page with bs4
            bs_soup = BeautifulSoup(html, 'lxml')
            lis = bs_soup.find_all(name='li', attrs={'class': 'bookname'})
            print(lis)
            for li in lis:
                a_list = li.select('a')
                if len(a_list) > 0:
                    detail_url = a_list[0].attrs['href']
                    print(detail_url)
                    self.get_noval_detail(detail_url)
        else:
            print('Failed to fetch data')

    def get_noval_detail(self, noval_url):
        """Fetch a book's detail page and parse the data out of it."""
        html = self.send_request(noval_url)
        if html:
            print('Got the detail page')
            # Parse with bs4
            noval_dict = {}
            bs_soup = BeautifulSoup(html, 'lxml')
            # Book id, XPath: //div[@class="dh"]/p/label/text()
            book_id = bs_soup.select('div.dh p label')[0].get_text()
            noval_dict['book_id'] = re.search(r'\d+', book_id).group()
            # Ticket (vote) count, XPath:
            # //div[@class="piao"]/p[2]/span[@class="huocolor"]/text()
            noval_dict['hot_track'] = bs_soup.select('.piao p')[1].select('.huocolor')[0].get_text()
            print(noval_dict)
            # self.save_data(noval_dict)

    def save_data(self, noval_dict):
        """Persist the data."""
        pass

    def extract_first(self, data, default=''):
        """Return the first element of a result list, or a default value."""
        if len(data) > 0:
            return data[0]
        return default

    def send_request(self, url, header=None, data=None, method="GET"):
        """Send a request and return the page source on success."""
        header = self.default_headers if not header else header
        if method == 'GET':
            # Send a GET request
            response = requests.get(url=url, params=data, headers=header)
        else:
            # Send a POST request
            response = requests.post(url=url, data=data, headers=header)
        if response.status_code == 200:
            # Request succeeded: return the page source
            return response.text


if __name__ == '__main__':
    spider = HengYanSpider()
    spider.get_noval_url()
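For reference, the two bs4 lookup styles the spider mixes, find_all with an attribute filter and select with CSS selectors, behave like this on a small inline fragment (the markup below is made up for illustration, not taken from hengyan.com):

from bs4 import BeautifulSoup

html = '''
<ul>
  <li class="bookname"><a href="/book/1.aspx">Book One</a></li>
  <li class="bookname"><a href="/book/2.aspx">Book Two</a></li>
</ul>
'''
soup = BeautifulSoup(html, 'lxml')

# find_all: match by tag name plus an attribute filter
lis = soup.find_all(name='li', attrs={'class': 'bookname'})

# select: match by CSS selector; both calls return lists
for li in lis:
    a = li.select('a')[0]
    print(a.attrs['href'], a.get_text())  # /book/1.aspx Book One ...

Both calls return lists, which is why the spider checks len(a_list) > 0 before indexing.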
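The entry point above only crawls the first list page, and save_data is left empty. Below is a minimal sketch of how both gaps could be filled, assuming the list pages really do run from ..._1.aspx through ..._100.aspx as the commented URLs suggest; the output file name hengyan_books.jsonl is hypothetical, and the self.save_data(noval_dict) call in get_noval_detail would need to be uncommented for records to actually be written:

import json

def save_data(self, noval_dict):
    # One possible save_data: append each record as one JSON line
    # (hengyan_books.jsonl is an assumed file name, not from the original).
    with open('hengyan_books.jsonl', 'a', encoding='utf-8') as f:
        f.write(json.dumps(noval_dict, ensure_ascii=False) + '\n')

HengYanSpider.save_data = save_data  # replace the empty stub for this sketch

if __name__ == '__main__':
    spider = HengYanSpider()
    base_url = 'http://all.hengyan.com/1/0_0_0_0_0_0_0_0_0_{}.aspx'
    for page in range(1, 101):  # pages 1 through 100, per the URLs above
        spider.get_noval_url(base_url.format(page))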
