Python学习第4天

爬虫学习

# -*- coding: utf-8 -*-
# @Time: 2019/7/31 11:28
# @Author: Eric Lee
# @Email: li.yan_li@neusoft.com
# @File: spider_dangdang.py
# @Software: PyCharm
import requests
from lxml import html


def spider_dangdang(isbn):
    """Print the Dangdang search results for a book.

    Queries search.dangdang.com with *isbn* as the search key, then prints,
    for every result item, its title, purchase link, current price (as a
    float) and seller name.

    Args:
        isbn: Search key interpolated into the query URL.

    Returns:
        None. All output goes to stdout.
    """
    # Target search page for the requested key.
    url = 'http://search.dangdang.com/?key={}&act=input'.format(isbn)

    # Send a browser-like User-Agent with the request.
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/75.0.3770.142 Safari/537.36"
        )
    }
    # A timeout keeps the script from hanging forever on a stalled connection.
    resp = requests.get(url, headers=headers, timeout=10)
    html_data = resp.text

    # Parse the page and select one <li> per search result.
    selector = html.fromstring(html_data)
    ul_list = selector.xpath('//div[@id="search_nature_rg"]/ul/li')
    print('您好，共有{}家店铺售卖此图书'.format(len(ul_list)))

    for li in ul_list:
        titles = li.xpath('./a/@title')
        links = li.xpath('a/@href')
        prices = li.xpath(
            './p[@class="price"]/span[@class="search_now_price"]/text()'
        )
        # Skip items that lack any mandatory field instead of crashing with
        # IndexError on the first incomplete <li>.
        if not (titles and links and prices):
            continue

        # Book title.
        title = titles[0].strip()
        print(title)

        # Purchase link.
        link = links[0]
        print(link)

        # Current price: drop the currency sign before converting.
        # NOTE(review): the half-width yen sign is stripped as well in case
        # the page uses it instead of the full-width one - confirm.
        price = float(prices[0].replace('￥', '').replace('¥', ''))
        print(price)

        # Seller name; items without a seller link are reported as
        # '当当自营'.
        stores = li.xpath('./p[@class="search_shangjia"]/a/text()')
        store = stores[0] if stores else '当当自营'
        print(store)

XPath 节点
在 XPath 中，有七种类型的节点：元素、属性、文本、命名空间、处理指令、注释以及文档（根）节点。XML 文档是被作为节点树来对待的。树的根被称为文档节点或者根节点。
请看下面这个 XML 文档：

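<?xml version="1.0" encoding="ISO-8859-1"?>
<bookstore>
  <book>
    <title lang="en">Harry Potter</title>
    <author>J K. Rowling</author>
    <year>2005</year>
    <price>29.99</price>
  </book>
</bookstore>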

上面的XML文档中的节点例子：
<bookstore> (文档节点)
<author>J K. Rowling</author> (元素节点)
lang="en" (属性节点)
基本值（或称原子值，Atomic value）
基本值是无父或无子的节点。
基本值的例子：
J K. Rowling
"en"
项目（Item）
项目是基本值或者节点。
节点关系
父（Parent）
每个元素以及属性都有一个父。
在下面的例子中，book 元素是 title、author、year 以及 price 元素的父：

<book>
  <title>Harry Potter</title>
  <author>J K. Rowling</author>
  <year>2005</year>
  <price>29.99</price>
</book>

子（Children）
元素节点可有零个、一个或多个子。
在下面的例子中，title、author、year 以及 price 元素都是 book 元素的子：

<book>
  <title>Harry Potter</title>
  <author>J K. Rowling</author>
  <year>2005</year>
  <price>29.99</price>
</book>

同胞（Sibling）
拥有相同的父的节点
在下面的例子中，title、author、year 以及 price 元素都是同胞：

<book>
  <title>Harry Potter</title>
  <author>J K. Rowling</author>
  <year>2005</year>
  <price>29.99</price>
</book>

先辈（Ancestor）
某节点的父、父的父，等等。
在下面的例子中，title 元素的先辈是 book 元素和 bookstore 元素：

<bookstore>
  <book>
    <title>Harry Potter</title>
    <author>J K. Rowling</author>
    <year>2005</year>
    <price>29.99</price>
  </book>
</bookstore>

后代（Descendant）
某个节点的子，子的子，等等。
在下面的例子中，bookstore 的后代是 book、title、author、year 以及 price 元素：

<bookstore>
  <book>
    <title>Harry Potter</title>
    <author>J K. Rowling</author>
    <year>2005</year>
    <price>29.99</price>
  </book>
</bookstore>

选取节点

文章图片
选取节点方法

电影top5

from collections import Counter

import requests
from lxml import html
import pandas as pd
import jieba
from matplotlib import pyplot as plt

# SimHei supplies CJK glyphs so Chinese film titles render in the charts;
# turning off unicode_minus makes matplotlib draw minus signs with an ASCII
# hyphen.
plt.rcParams["font.sans-serif"] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False


def _first(node, path):
    """Return the first result of XPath *path* on *node*, or None if empty."""
    found = node.xpath(path)
    return found[0] if found else None


def _explode_first(count):
    """Return *count* pie offsets that pull out only the first wedge."""
    return [0.1] + [0] * (count - 1) if count else []


def Film():
    """Scrape Douban's upcoming-films page for Chongqing and chart it.

    Downloads the page, extracts title, release date, genre, country and
    "want to see" count for every listed film, and prints them sorted by
    that count in descending order. It then shows two pie charts: the
    "want to see" counts of the (up to) five most wanted films, and the
    number of upcoming films per country.

    Returns:
        None. Results are printed and displayed with matplotlib.
    """
    url = 'https://movie.douban.com/cinema/later/chongqing/'
    header = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/75.0.3770.142 Safari/537.36"
        )
    }
    # A timeout keeps the script from hanging forever on a stalled connection.
    resp = requests.get(url, headers=header, timeout=10)
    html_data = resp.text

    # One <div> per upcoming film.
    selector = html.fromstring(html_data)
    film = selector.xpath('//div[@id="showing-soon"]/div')
    print(film)

    div_list = []
    for film_list in film:
        title_list = _first(film_list, './div/h3/a/text()')         # title
        time_list = _first(film_list, './div/ul/li[1]/text()')      # release date
        type_list = _first(film_list, './div/ul/li[2]/text()')      # genre
        con_list = _first(film_list, './div/ul/li[3]/text()')       # country
        number_list = _first(film_list, './div/ul/li[4]/span/text()')  # want-to-see

        fields = (title_list, time_list, type_list, con_list, number_list)
        # Skip entries missing any field instead of raising IndexError.
        if any(value is None for value in fields):
            continue
        for value in fields:
            print(value)

        # Strip the '人想看' suffix so the count can be sorted numerically.
        number_list = int(number_list.replace('人想看', ''))

        div_list.append({
            'title': title_list,
            'time': time_list,
            'type': type_list,
            'con': con_list,
            'number': number_list,
        })

    # Most wanted films first.
    div_list.sort(key=lambda entry: entry['number'], reverse=True)
    print(div_list)
    for items_list in div_list:
        print(items_list)

    # Nothing to chart when the page yielded no usable entries.
    if not div_list:
        return

    # --- Pie chart: share of "want to see" counts among the top films ---
    # Slicing tolerates pages that list fewer than five films.
    top5_store = div_list[:5]
    titles = [entry['title'] for entry in top5_store]
    print(titles)
    wanted = [entry['number'] for entry in top5_store]
    print(wanted)
    # explode must have exactly one offset per wedge, so size it from the data.
    plt.pie(wanted, explode=_explode_first(len(wanted)), labels=titles,
            shadow=True, autopct='%1.1f%%')
    plt.axis('equal')
    plt.legend(loc=2)  # loc=2 is 'upper left'
    plt.show()

    # --- Pie chart: number of upcoming films per country ---
    countries = [entry['con'] for entry in div_list]
    print(countries)
    counts = Counter(countries)
    print(dict(counts))
    # Materialise the views as lists before handing them to matplotlib.
    name = list(counts.keys())
    print(name)
    number = list(counts.values())
    print(number)
    plt.pie(number, explode=_explode_first(len(number)), labels=name,
            shadow=True, autopct='%1.1f%%')
    plt.axis('equal')
    plt.legend(loc=2)  # loc=2 is 'upper left'
    plt.show()


Film()