大师兄的Python学习笔记(二十二): 爬虫(三)
大师兄的Python学习笔记(二十一): 爬虫(二)
大师兄的Python学习笔记(二十三): 爬虫(四)
四、保存数据
- 数据在提取后,既可以保存在文件中,也可以保存在数据库中。
- 存储到txt文件是最基础的数据保存方式。
from pyquery import PyQuery as pq
import requests


def sort_data(func):
    """Decorator: parse the HTML returned by *func* into movie records.

    The wrapped function must return an HTML string; the decorator returns
    a zip of (index, name, director, actor, star, link) tuples.
    """
    def deco(*args, **kargs):
        data = func(*args, **kargs)      # raw HTML from the wrapped function
        html_data = pq(data)
        hd = html_data('.hd')
        bd = html_data('.bd')
        index = [x.text() for x in html_data.find('em').items()]
        name = [x.text() for x in hd.find('.title:first-child').items()]
        # first line of the <p> block holds "director...actor" text
        director_actor = [x.html().strip().split('\n')[0]
                          for x in bd.children('.star').siblings('p:first-of-type').items()]
        # fields are separated by three non-breaking spaces on the page
        director = [x.split('\xa0\xa0\xa0')[0] for x in director_actor]
        actor = [x.split('\xa0\xa0\xa0')[1] for x in director_actor]
        star = bd('.star')('span:nth-child(4)').text().split()
        link = [x.attr.href for x in hd('a').items()]
        return zip(index, name, director, actor, star, link)
    return deco


@sort_data
def get_page(url):
    """Fetch *url* and return its HTML text, or None on a non-200 status."""
    headers = {
        'User-Agent': ('Mozilla/5.0 (Windows NT 10.0; WOW64) '
                       'AppleWebKit/537.36 (KHTML, like Gecko) '
                       'Chrome/78.0.3904.108 Safari/537.36'),
    }
    res = requests.get(url=url, headers=headers)
    if res.status_code == 200:
        return res.text  # HTML page
    return None


def save_to_txt(data, filename="douban_top10.txt"):
    """Save the first 10 records to a text file, one space-joined line each."""
    with open(filename, 'w', encoding='utf-8') as file:
        for i in range(10):
            line = f'{" ".join(next(data))}\n'
            file.write(line)


def main():
    # entry point
    url = 'https://movie.douban.com/top250'
    page_data = get_page(url)
    save_to_txt(page_data)


if __name__ == '__main__':
    main()
4.1.2 存储为json文件
- 大师兄的Python学习笔记(十九): Python与(XML和JSON)
- json格式非常方便前端使用。
from pyquery import PyQuery as pq
import requests
import json


def sort_data(func):
    """Decorator: parse the HTML returned by *func* into movie records.

    The wrapped function must return an HTML string; the decorator returns
    a zip of (index, name, director, actor, star, link) tuples.
    """
    def deco(*args, **kargs):
        data = func(*args, **kargs)      # raw HTML from the wrapped function
        html_data = pq(data)
        hd = html_data('.hd')
        bd = html_data('.bd')
        index = [x.text() for x in html_data.find('em').items()]
        name = [x.text() for x in hd.find('.title:first-child').items()]
        # first line of the <p> block holds "director...actor" text
        director_actor = [x.html().strip().split('\n')[0]
                          for x in bd.children('.star').siblings('p:first-of-type').items()]
        # fields are separated by three non-breaking spaces on the page
        director = [x.split('\xa0\xa0\xa0')[0] for x in director_actor]
        actor = [x.split('\xa0\xa0\xa0')[1] for x in director_actor]
        star = bd('.star')('span:nth-child(4)').text().split()
        link = [x.attr.href for x in hd('a').items()]
        return zip(index, name, director, actor, star, link)
    return deco


@sort_data
def get_page(url):
    """Fetch *url* and return its HTML text, or None on a non-200 status."""
    headers = {
        'User-Agent': ('Mozilla/5.0 (Windows NT 10.0; WOW64) '
                       'AppleWebKit/537.36 (KHTML, like Gecko) '
                       'Chrome/78.0.3904.108 Safari/537.36'),
    }
    res = requests.get(url=url, headers=headers)
    if res.status_code == 200:
        return res.text  # HTML page
    return None


def save_to_json(data, filename="douban_top10.json"):
    """Save the first 10 records as a single JSON array.

    Dumping each record with a separate json.dump() call would concatenate
    ten JSON values into one file, which is not valid JSON — collect them
    into a list and dump once instead.
    """
    records = [next(data) for i in range(10)]
    with open(filename, 'w', encoding='utf-8') as file:
        json.dump(records, file, ensure_ascii=False)


def main():
    # entry point
    url = 'https://movie.douban.com/top250'
    page_data = get_page(url)
    save_to_json(page_data)


if __name__ == '__main__':
    main()
4.1.3 存储为pickle文件
- pickle支持直接保存Python的大部分数据类型,但只能在Python中使用。
from pyquery import PyQuery as pq
import requests
import pickle


def sort_data(func):
    """Decorator: parse the HTML returned by *func* into movie records.

    The wrapped function must return an HTML string; the decorator returns
    a zip of (index, name, director, actor, star, link) tuples.
    """
    def deco(*args, **kargs):
        data = func(*args, **kargs)      # raw HTML from the wrapped function
        html_data = pq(data)
        hd = html_data('.hd')
        bd = html_data('.bd')
        index = [x.text() for x in html_data.find('em').items()]
        name = [x.text() for x in hd.find('.title:first-child').items()]
        # first line of the <p> block holds "director...actor" text
        director_actor = [x.html().strip().split('\n')[0]
                          for x in bd.children('.star').siblings('p:first-of-type').items()]
        # fields are separated by three non-breaking spaces on the page
        director = [x.split('\xa0\xa0\xa0')[0] for x in director_actor]
        actor = [x.split('\xa0\xa0\xa0')[1] for x in director_actor]
        star = bd('.star')('span:nth-child(4)').text().split()
        link = [x.attr.href for x in hd('a').items()]
        return zip(index, name, director, actor, star, link)
    return deco


@sort_data
def get_page(url):
    """Fetch *url* and return its HTML text, or None on a non-200 status."""
    headers = {
        'User-Agent': ('Mozilla/5.0 (Windows NT 10.0; WOW64) '
                       'AppleWebKit/537.36 (KHTML, like Gecko) '
                       'Chrome/78.0.3904.108 Safari/537.36'),
    }
    res = requests.get(url=url, headers=headers)
    if res.status_code == 200:
        return res.text  # HTML page
    return None


def save_to_pickle(data, filename="douban_top10.pk"):
    """Save all records to a pickle file.

    zip objects cannot be pickled (TypeError: cannot pickle 'zip' object),
    so materialize the iterator into a list first.
    """
    with open(filename, 'wb') as file:
        pickle.dump(list(data), file)


def main():
    # entry point
    url = 'https://movie.douban.com/top250'
    page_data = get_page(url)
    save_to_pickle(page_data)


if __name__ == '__main__':
    main()
4.1.4 存储为shelve文件
- shelve支持所有的Python数据类型,但只能在Python中使用。
- 可以把shelve看成是一个临时的缓存数据库。
- 操作方法类似字典。
from pyquery import PyQuery as pq
import requests
import shelve


def sort_data(func):
    """Decorator: parse the HTML returned by *func* into movie records.

    The wrapped function must return an HTML string; the decorator returns
    a zip of (index, name, director, actor, star, link) tuples.
    """
    def deco(*args, **kargs):
        data = func(*args, **kargs)      # raw HTML from the wrapped function
        html_data = pq(data)
        hd = html_data('.hd')
        bd = html_data('.bd')
        index = [x.text() for x in html_data.find('em').items()]
        name = [x.text() for x in hd.find('.title:first-child').items()]
        # first line of the <p> block holds "director...actor" text
        director_actor = [x.html().strip().split('\n')[0]
                          for x in bd.children('.star').siblings('p:first-of-type').items()]
        # fields are separated by three non-breaking spaces on the page
        director = [x.split('\xa0\xa0\xa0')[0] for x in director_actor]
        actor = [x.split('\xa0\xa0\xa0')[1] for x in director_actor]
        star = bd('.star')('span:nth-child(4)').text().split()
        link = [x.attr.href for x in hd('a').items()]
        return zip(index, name, director, actor, star, link)
    return deco


@sort_data
def get_page(url):
    """Fetch *url* and return its HTML text, or None on a non-200 status."""
    headers = {
        'User-Agent': ('Mozilla/5.0 (Windows NT 10.0; WOW64) '
                       'AppleWebKit/537.36 (KHTML, like Gecko) '
                       'Chrome/78.0.3904.108 Safari/537.36'),
    }
    res = requests.get(url=url, headers=headers)
    if res.status_code == 200:
        return res.text  # HTML page
    return None


def save_to_shelve(data, filename="douban_top10.db"):
    """Save the first 10 records to a shelve file, keyed by rank ("1".."10")."""
    with shelve.open(filename) as db:
        for i in range(10):
            db[f"{i + 1}"] = next(data)


def main():
    # entry point
    url = 'https://movie.douban.com/top250'
    page_data = get_page(url)
    save_to_shelve(page_data)


if __name__ == '__main__':
    main()
4.1.5 存储为csv文件
- csv文件可以看作是纯文本格式的xls文件。
from pyquery import PyQuery as pq
import requests
import csv


def sort_data(func):
    """Decorator: parse the HTML returned by *func* into movie records.

    The wrapped function must return an HTML string; the decorator returns
    a zip of (index, name, director, actor, star, link) tuples.
    """
    def deco(*args, **kargs):
        data = func(*args, **kargs)      # raw HTML from the wrapped function
        html_data = pq(data)
        hd = html_data('.hd')
        bd = html_data('.bd')
        index = [x.text() for x in html_data.find('em').items()]
        name = [x.text() for x in hd.find('.title:first-child').items()]
        # first line of the <p> block holds "director...actor" text
        director_actor = [x.html().strip().split('\n')[0]
                          for x in bd.children('.star').siblings('p:first-of-type').items()]
        # fields are separated by three non-breaking spaces on the page
        director = [x.split('\xa0\xa0\xa0')[0] for x in director_actor]
        actor = [x.split('\xa0\xa0\xa0')[1] for x in director_actor]
        star = bd('.star')('span:nth-child(4)').text().split()
        link = [x.attr.href for x in hd('a').items()]
        return zip(index, name, director, actor, star, link)
    return deco


@sort_data
def get_page(url):
    """Fetch *url* and return its HTML text, or None on a non-200 status."""
    headers = {
        'User-Agent': ('Mozilla/5.0 (Windows NT 10.0; WOW64) '
                       'AppleWebKit/537.36 (KHTML, like Gecko) '
                       'Chrome/78.0.3904.108 Safari/537.36'),
    }
    res = requests.get(url=url, headers=headers)
    if res.status_code == 200:
        return res.text  # HTML page
    return None


def save_to_csv(data, filename="douban_top10.csv"):
    """Save all records to a CSV file.

    newline='' is required by the csv module; without it every row is
    followed by a blank line on Windows.
    """
    with open(filename, 'w', encoding='utf-8', newline='') as file:
        writer = csv.writer(file)
        writer.writerows(data)


def main():
    # entry point
    url = 'https://movie.douban.com/top250'
    page_data = get_page(url)
    save_to_csv(page_data)


if __name__ == '__main__':
    main()
4.2 数据库存储 4.2.1 存储到MySQL
- MySQL是目前最流行的关系型数据库之一。
- 通过行和列组成的二维表存储。
- 以本地MySQL server为例。
from pyquery import PyQuery as pq
import pymysql
import requests


def sort_data(func):
    """Decorator: parse the HTML returned by *func* into movie records.

    The wrapped function must return an HTML string; the decorator returns
    a zip of (index, name, director, actor, star, link) tuples.
    """
    def deco(*args, **kargs):
        data = func(*args, **kargs)      # raw HTML from the wrapped function
        html_data = pq(data)
        hd = html_data('.hd')
        bd = html_data('.bd')
        index = [x.text() for x in html_data.find('em').items()]
        name = [x.text() for x in hd.find('.title:first-child').items()]
        # first line of the <p> block holds "director...actor" text
        director_actor = [x.html().strip().split('\n')[0]
                          for x in bd.children('.star').siblings('p:first-of-type').items()]
        # fields are separated by three non-breaking spaces on the page
        director = [x.split('\xa0\xa0\xa0')[0] for x in director_actor]
        actor = [x.split('\xa0\xa0\xa0')[1] for x in director_actor]
        star = bd('.star')('span:nth-child(4)').text().split()
        link = [x.attr.href for x in hd('a').items()]
        return zip(index, name, director, actor, star, link)
    return deco


@sort_data
def get_page(url):
    """Fetch *url* and return its HTML text, or None on a non-200 status."""
    headers = {
        'User-Agent': ('Mozilla/5.0 (Windows NT 10.0; WOW64) '
                       'AppleWebKit/537.36 (KHTML, like Gecko) '
                       'Chrome/78.0.3904.108 Safari/537.36'),
    }
    res = requests.get(url=url, headers=headers)
    if res.status_code == 200:
        return res.text  # HTML page
    return None


def save_to_mysql_db(data, host='localhost', user='root', password='root',
                     port=3306, db='note_sample', table='douban_top10'):
    """Store the first 10 records in a MySQL table, creating it if missing."""
    # 'conn' avoids shadowing the 'db' parameter
    conn = pymysql.connect(host=host, user=user, password=password,
                           port=port, db=db, charset='utf8')
    with conn:
        cursor = conn.cursor()
        # create the table on first run
        sql_create_table = (
            f'CREATE TABLE IF NOT EXISTS {table}('
            'ind_order VARCHAR(255) NOT NULL PRIMARY KEY, '
            'name VARCHAR(255) NOT NULL, '
            'director VARCHAR(255), actor VARCHAR(255), '
            'star VARCHAR(255), link_to VARCHAR(255))'
        )
        cursor.execute(sql_create_table)
        conn.commit()
        # parameterized INSERT: the driver quotes/escapes the values, which
        # avoids SQL injection and breakage on quotes in movie titles
        sql_insert_data = (
            f'INSERT INTO {table}'
            '(ind_order,name,director,actor,star,link_to) '
            'VALUES(%s,%s,%s,%s,%s,%s)'
        )
        for i in range(10):
            try:
                cursor.execute(sql_insert_data, next(data))
                conn.commit()
            except Exception as e:
                print(e)
                conn.rollback()


def main():
    # entry point
    url = 'https://movie.douban.com/top250'
    page_data = get_page(url)
    save_to_mysql_db(page_data)


if __name__ == '__main__':
    main()
4.2.2 存储到MongoDB
- MongoDB是基于分布式文件存储的开源非关系型数据库。
- 内容存储类似JSON格式。
- 以单机版MongoDB为例。
from pyquery import PyQuery as pq
import pymongo
import requests


def sort_data(func):
    """Decorator: parse the HTML returned by *func* into movie records.

    The wrapped function must return an HTML string; the decorator returns
    a zip of (index, name, director, actor, star, link) tuples.
    """
    def deco(*args, **kargs):
        data = func(*args, **kargs)      # raw HTML from the wrapped function
        html_data = pq(data)
        hd = html_data('.hd')
        bd = html_data('.bd')
        index = [x.text() for x in html_data.find('em').items()]
        name = [x.text() for x in hd.find('.title:first-child').items()]
        # first line of the <p> block holds "director...actor" text
        director_actor = [x.html().strip().split('\n')[0]
                          for x in bd.children('.star').siblings('p:first-of-type').items()]
        # fields are separated by three non-breaking spaces on the page
        director = [x.split('\xa0\xa0\xa0')[0] for x in director_actor]
        actor = [x.split('\xa0\xa0\xa0')[1] for x in director_actor]
        star = bd('.star')('span:nth-child(4)').text().split()
        link = [x.attr.href for x in hd('a').items()]
        return zip(index, name, director, actor, star, link)
    return deco


@sort_data
def get_page(url):
    """Fetch *url* and return its HTML text, or None on a non-200 status."""
    headers = {
        'User-Agent': ('Mozilla/5.0 (Windows NT 10.0; WOW64) '
                       'AppleWebKit/537.36 (KHTML, like Gecko) '
                       'Chrome/78.0.3904.108 Safari/537.36'),
    }
    res = requests.get(url=url, headers=headers)
    if res.status_code == 200:
        return res.text  # HTML page
    return None


def save_to_mongo_db(data, host='localhost', port=27017,
                     db='note_sample', collection='douban_top10'):
    """Store the first 10 records as documents in a MongoDB collection."""
    client = pymongo.MongoClient(host=host, port=port)
    database = client[db]            # database handle
    coll = database[collection]      # collection handle
    keys = ['ind', 'name', 'director', 'actor', 'stars', 'link']
    for i in range(10):
        coll.insert_one(dict(zip(keys, next(data))))


def main():
    # entry point
    url = 'https://movie.douban.com/top250'
    page_data = get_page(url)
    save_to_mongo_db(page_data)


if __name__ == '__main__':
    main()
4.2.3 存储到Redis
- Redis是一款基于内存非关系型数据库。
- 基于键值对保存。
from pyquery import PyQuery as pq
from redis import StrictRedis
import requests


def sort_data(func):
    """Decorator: parse the HTML returned by *func* into movie records.

    The wrapped function must return an HTML string; the decorator returns
    a zip of (index, name, director, actor, star, link) tuples.
    """
    def deco(*args, **kargs):
        data = func(*args, **kargs)      # raw HTML from the wrapped function
        html_data = pq(data)
        hd = html_data('.hd')
        bd = html_data('.bd')
        index = [x.text() for x in html_data.find('em').items()]
        name = [x.text() for x in hd.find('.title:first-child').items()]
        # first line of the <p> block holds "director...actor" text
        director_actor = [x.html().strip().split('\n')[0]
                          for x in bd.children('.star').siblings('p:first-of-type').items()]
        # fields are separated by three non-breaking spaces on the page
        director = [x.split('\xa0\xa0\xa0')[0] for x in director_actor]
        actor = [x.split('\xa0\xa0\xa0')[1] for x in director_actor]
        star = bd('.star')('span:nth-child(4)').text().split()
        link = [x.attr.href for x in hd('a').items()]
        return zip(index, name, director, actor, star, link)
    return deco


@sort_data
def get_page(url):
    """Fetch *url* and return its HTML text, or None on a non-200 status."""
    headers = {
        'User-Agent': ('Mozilla/5.0 (Windows NT 10.0; WOW64) '
                       'AppleWebKit/537.36 (KHTML, like Gecko) '
                       'Chrome/78.0.3904.108 Safari/537.36'),
    }
    res = requests.get(url=url, headers=headers)
    if res.status_code == 200:
        return res.text  # HTML page
    return None


def save_to_redis(data, host='localhost', port=6379, db=0, password=None):
    """Store the first 10 records in Redis: key = rank, value = joined fields."""
    redis = StrictRedis(host=host, port=port, db=db, password=password)
    for i in range(10):
        record = next(data)
        redis.set(record[0], " ".join(record[1:]))


def main():
    # entry point
    url = 'https://movie.douban.com/top250'
    page_data = get_page(url)
    save_to_redis(page_data)


if __name__ == '__main__':
    main()
参考资料
- https://blog.csdn.net/u010138758/article/details/80152151 J-Ombudsman
- https://www.cnblogs.com/zhuluqing/p/8832205.html moisiet
- https://www.runoob.com 菜鸟教程
- http://www.tulingxueyuan.com/ 北京图灵学院
- http://www.imooc.com/article/19184?block_id=tuijian_wz#child_5_1 两点水
- https://blog.csdn.net/weixin_44213550/article/details/91346411 python老菜鸟
- https://realpython.com/python-string-formatting/ Dan Bader
- https://www.liaoxuefeng.com/ 廖雪峰
- https://blog.csdn.net/Gnewocean/article/details/85319590 新海说
- https://www.cnblogs.com/Nicholas0707/p/9021672.html Nicholas
- https://www.cnblogs.com/dalaoban/p/9331113.html 超天大圣
- https://blog.csdn.net/zhubao124/article/details/81662775 zhubao124
- https://blog.csdn.net/z59d8m6e40/article/details/72871485 z59d8m6e40
- 《Python学习手册》Mark Lutz
- 《Python编程 从入门到实践》Eric Matthes
- 《Python3网络爬虫开发实战》崔庆才
推荐阅读
- 热闹中的孤独
- JAVA(抽象类与接口的区别&重载与重写&内存泄漏)
- 我要做大厨
- 放屁有这三个特征的,请注意啦!这说明你的身体毒素太多
- 一个人的旅行,三亚
- 布丽吉特,人生绝对的赢家
- 慢慢的美丽
- 尽力
- 一个小故事,我的思考。
- 家乡的那条小河