笔记|微博爬取长津湖博文及评论
# Changjin Lake film data from Weibo
# @Time: 20211006
# @Author: heheyang
import json
import pprint
import re

import pandas as pd
import requests
def comments_singlePage_crawl(url, headers, comments_info, id):
    """
    Crawl a single page of comments for one post and collect them.

    :param url: hotflow comments API url for one page
    :param headers: request headers (cookie / user-agent)
    :param comments_info: dict of "id"/"date"/"text" lists, mutated in place
    :param id: id of the weibo post the comments belong to
    :raises KeyError, json.JSONDecodeError: when the response is not the
        expected JSON shape (caller uses this to stop paging)
    """
    # Fetch the page. headers must be passed as a keyword argument:
    # requests.get(url, headers) would send them as query params instead.
    html = requests.get(url, headers=headers).text
    # Parse the JSON payload
    html_dict = json.loads(html)
    comments_data = html_dict["data"]["data"]
    for comment in comments_data:
        comments_info["id"].append(id)
        comments_info["date"].append(comment["created_at"])
        # Strip embedded HTML tags (emoji <img>, links) from the comment body.
        # NOTE(review): the original pattern was garbled by scraping; "<.*?>"
        # is the conventional tag-strip regex — confirm against the raw source.
        text = re.sub(r"<.*?>", "", comment["text"])
        comments_info["text"].append(text)
def weibo_bowen_singelPage_crawl(url, headers, mblog_info, comments_info):
    """
    Crawl a single page of posts, then every comment page of each post.

    :param url: container/getIndex API url for one page of posts
    :param headers: request headers (cookie / user-agent)
    :param mblog_info: dict of "id"/"date"/"text" lists for posts, mutated in place
    :param comments_info: dict of "id"/"date"/"text" lists for comments, mutated in place
    """
    # Fetch and JSON-decode the page; headers must go as a keyword argument,
    # otherwise requests.get treats them as query parameters.
    html = requests.get(url, headers=headers).text
    html_dict = json.loads(html)
    users = html_dict["data"]["cards"]
    # Store each post on the page
    for user in users:
        mblog = user["mblog"]
        mblog_info["id"].append(mblog["id"])
        mblog_info["date"].append(mblog["created_at"])
        # Strip embedded HTML tags from the post body.
        # NOTE(review): original regex garbled by scraping; "<.*?>" is the
        # conventional tag-strip pattern — confirm against the raw source.
        text = re.sub(r"<.*?>", "", mblog["text"])
        mblog_info["text"].append(text)
        # Build the comments url; the loop below appends max_id_type values.
        comments_url = "https://m.weibo.cn/comments/hotflow?id=%s&mid=%s&max_id_type=" % (mblog["id"], mblog["id"])
        # Page through the comments until a request fails (missing "data" key
        # or non-JSON response marks the end).
        i = 0
        while True:
            try:
                comments_singlePage_crawl(comments_url + str(i), headers, comments_info, mblog["id"])
                i += 1
            except Exception:
                # Narrowed from a bare except so Ctrl-C still interrupts.
                break
    pprint.pprint(comments_info)
def weibo_bowen_data_crawl(url, headers):
    """
    Crawl several pages of posts (and their comments).

    :param url: base API url; the page number is appended per request
    :param headers: request headers (cookie / user-agent)
    :return: (mblog_info, comments_info) dicts of parallel "id"/"date"/"text" lists
    """
    # Accumulator for post data
    mblog_info = {"id": [],
                  "date": [],
                  "text": [],
                  }
    # Accumulator for comment data
    comments_info = {"id": [],
                     "date": [],
                     "text": [],
                     }
    # Crawl pages 1..9 (NOTE(review): the original comment said 10 pages,
    # but range(1, 10) visits 9 — kept as-is to preserve behavior).
    for i in range(1, 10):
        # Append the page number and collect posts + comments for that page
        weibo_bowen_singelPage_crawl(url + str(i), headers, mblog_info, comments_info)
    return mblog_info, comments_info
def bowen_data_store(mblog_info, comments_info):
    """
    Build DataFrames from the crawled data and save them to Excel.

    :param mblog_info: dict of "id"/"date"/"text" lists for posts
    :param comments_info: dict of "id"/"date"/"text" lists for comments
    :return: None; writes bowen_data.xlsx and bowen_comments_data.xlsx
    """
    # Sheet 1: posts (scraper had injected junk before pd.DataFrame — removed)
    data = pd.DataFrame(mblog_info)
    data["num"] = data.index + 1
    data["keyword"] = ["Film Changjin Lake"] * len(data["num"])
    df = data.loc[:, ["num", "keyword", "id", "date", "text"]]
    df.to_excel("bowen_data.xlsx", sheet_name="Sheet1")
    # Sheet 2: comments
    comments_data = pd.DataFrame(comments_info)
    comments_data["num"] = comments_data.index + 1
    df_c = comments_data.loc[:, ["num", "id", "date", "text"]]
    df_c.to_excel("bowen_comments_data.xlsx", sheet_name="Sheet1")
if __name__ == '__main__':
    # Base Weibo API url for the Changjin Lake account; the page number is
    # appended by the crawler. (Scraper-injected "https://www.it610.com/..."
    # junk removed from the value= query parameter.)
    url = "https://m.weibo.cn/api/container/getIndex?uid=7377392724&luicode=10000011&lfid=100103type%3D1%26q%3D%E9%95%BF%E6%B4%A5%E6%B9%96&type=uid&value=7377392724&containerid=1076037377392724&page="
    # Request headers — fill in your own cookie and user-agent.
    headers = {"cookie": "自行添加",
               "user-agent": "自行添加"
               }
    mblog_info, comments_info = weibo_bowen_data_crawl(url, headers)
    bowen_data_store(mblog_info, comments_info)
注意修改请求头信息,结果会存储两个excel文件,一个博文存储文件,一个评论存储文件。
文章图片
【笔记|微博爬取长津湖博文及评论】
文章图片
写了一下午,欢迎交流,需要数据文件的可以私聊。
推荐阅读
- EffectiveObjective-C2.0|EffectiveObjective-C2.0 笔记 - 第二部分
- Android中的AES加密-下
- 【读书笔记】贝叶斯原理
- 【韩语学习】(韩语随堂笔记整理)
- 使用协程爬取网页,计算网页数据大小
- 人性的弱点-笔记
- 读书笔记:博登海默法理学|读书笔记:博登海默法理学 —— 正义的探索(1)
- D034+3组苏曼+《写作这回事》读书笔记
- 《自我的追寻》读书笔记3
- 最有效的时间管理工具(赢效率手册和总结笔记)