笔记|微博爬取长津湖博文及评论
# Changjin Lake film data from Weibo
# @Time: 20211006
# @Author: heheyang
import json
import pprint
import re

import pandas as pd
import requests
def comments_singlePage_crawl(url, headers, comments_info, id):
    """
    Crawl a single page of comments for one post and collect them.

    :param url: hotflow comments API url for one page
    :param headers: request headers (cookie / user-agent)
    :param comments_info: dict of "id"/"date"/"text" lists, mutated in place
    :param id: id of the weibo post the comments belong to
    :raises KeyError, json.JSONDecodeError: when the response is not the
        expected JSON shape (caller uses this to stop paging)
    """
    # Fetch the page. headers must be passed as a keyword argument:
    # requests.get(url, headers) would send them as query params instead.
    html = requests.get(url, headers=headers).text
    # Parse the JSON payload
    html_dict = json.loads(html)
    comments_data = html_dict["data"]["data"]
    for comment in comments_data:
        comments_info["id"].append(id)
        comments_info["date"].append(comment["created_at"])
        # Strip embedded HTML tags (emoji <img>, links) from the comment body.
        # NOTE(review): the original pattern was garbled by scraping; "<.*?>"
        # is the conventional tag-strip regex — confirm against the raw source.
        text = re.sub(r"<.*?>", "", comment["text"])
        comments_info["text"].append(text)
def weibo_bowen_singelPage_crawl(url, headers, mblog_info, comments_info):
    """
    Crawl a single page of posts, then every comment page of each post.

    :param url: container/getIndex API url for one page of posts
    :param headers: request headers (cookie / user-agent)
    :param mblog_info: dict of "id"/"date"/"text" lists for posts, mutated in place
    :param comments_info: dict of "id"/"date"/"text" lists for comments, mutated in place
    """
    # Fetch and JSON-decode the page; headers must go as a keyword argument,
    # otherwise requests.get treats them as query parameters.
    html = requests.get(url, headers=headers).text
    html_dict = json.loads(html)
    users = html_dict["data"]["cards"]
    # Store each post on the page
    for user in users:
        mblog = user["mblog"]
        mblog_info["id"].append(mblog["id"])
        mblog_info["date"].append(mblog["created_at"])
        # Strip embedded HTML tags from the post body.
        # NOTE(review): original regex garbled by scraping; "<.*?>" is the
        # conventional tag-strip pattern — confirm against the raw source.
        text = re.sub(r"<.*?>", "", mblog["text"])
        mblog_info["text"].append(text)
        # Build the comments url; the loop below appends max_id_type values.
        comments_url = "https://m.weibo.cn/comments/hotflow?id=%s&mid=%s&max_id_type=" % (mblog["id"], mblog["id"])
        # Page through the comments until a request fails (missing "data" key
        # or non-JSON response marks the end).
        i = 0
        while True:
            try:
                comments_singlePage_crawl(comments_url + str(i), headers, comments_info, mblog["id"])
                i += 1
            except Exception:
                # Narrowed from a bare except so Ctrl-C still interrupts.
                break
    pprint.pprint(comments_info)
def weibo_bowen_data_crawl(url, headers):
    """
    Crawl several pages of posts (and their comments).

    :param url: base API url; the page number is appended per request
    :param headers: request headers (cookie / user-agent)
    :return: (mblog_info, comments_info) dicts of parallel "id"/"date"/"text" lists
    """
    # Accumulator for post data
    mblog_info = {"id": [],
                  "date": [],
                  "text": [],
                  }
    # Accumulator for comment data
    comments_info = {"id": [],
                     "date": [],
                     "text": [],
                     }
    # Crawl pages 1..9 (NOTE(review): the original comment said 10 pages,
    # but range(1, 10) visits 9 — kept as-is to preserve behavior).
    for i in range(1, 10):
        # Append the page number and collect posts + comments for that page
        weibo_bowen_singelPage_crawl(url + str(i), headers, mblog_info, comments_info)
    return mblog_info, comments_info
def bowen_data_store(mblog_info, comments_info):
    """
    Build DataFrames from the crawled data and save them to Excel.

    :param mblog_info: dict of "id"/"date"/"text" lists for posts
    :param comments_info: dict of "id"/"date"/"text" lists for comments
    :return: None; writes bowen_data.xlsx and bowen_comments_data.xlsx
    """
    # Sheet 1: posts (scraper had injected junk before pd.DataFrame — removed)
    data = pd.DataFrame(mblog_info)
    data["num"] = data.index + 1
    data["keyword"] = ["Film Changjin Lake"] * len(data["num"])
    df = data.loc[:, ["num", "keyword", "id", "date", "text"]]
    df.to_excel("bowen_data.xlsx", sheet_name="Sheet1")
    # Sheet 2: comments
    comments_data = pd.DataFrame(comments_info)
    comments_data["num"] = comments_data.index + 1
    df_c = comments_data.loc[:, ["num", "id", "date", "text"]]
    df_c.to_excel("bowen_comments_data.xlsx", sheet_name="Sheet1")
if __name__ == '__main__':
    # Base Weibo API url for the Changjin Lake account; the page number is
    # appended by the crawler. (Scraper-injected "https://www.it610.com/..."
    # junk removed from the value= query parameter.)
    url = "https://m.weibo.cn/api/container/getIndex?uid=7377392724&luicode=10000011&lfid=100103type%3D1%26q%3D%E9%95%BF%E6%B4%A5%E6%B9%96&type=uid&value=7377392724&containerid=1076037377392724&page="
    # Request headers — fill in your own cookie and user-agent.
    headers = {"cookie": "自行添加",
               "user-agent": "自行添加"
               }
    mblog_info, comments_info = weibo_bowen_data_crawl(url, headers)
    bowen_data_store(mblog_info, comments_info)
注意修改请求头信息,结果会存储两个excel文件,一个博文存储文件,一个评论存储文件。
文章图片
【笔记|微博爬取长津湖博文及评论】
文章图片
写了一下午,欢迎交流,需要数据文件的可以私聊。
推荐阅读
- EffectiveObjective-C2.0|EffectiveObjective-C2.0 笔记 - 第二部分
- Android中的AES加密-下
- 【读书笔记】贝叶斯原理
- 【韩语学习】(韩语随堂笔记整理)
- 使用协程爬取网页,计算网页数据大小
- 人性的弱点-笔记
- 读书笔记:博登海默法理学|读书笔记:博登海默法理学 —— 正义的探索(1)
- D034+3组苏曼+《写作这回事》读书笔记
- 《自我的追寻》读书笔记3
- 最有效的时间管理工具(赢效率手册和总结笔记)