如何使用python爬取知乎热榜Top50数据
目录
- 1、导入第三方库
- 2、程序的主函数
- 3、正则表达式匹配数据
- 4、程序运行结果
- 5、程序源代码
1、导入第三方库
import urllib.request,urllib.error#请求网页from bs4 import BeautifulSoup# 解析数据import sqlite3# 导入数据库import re # 正则表达式import time # 获取当前时间
2、程序的主函数
def main():
    """Crawl the Zhihu hot list and store the rows in a per-day SQLite file.

    The database file name is ``"zhihuTop50"`` followed by today's local date
    (``YYYY-MM-DD``), so each calendar day gets its own file.
    """
    # Page to crawl.
    baseurl = "https://www.zhihu.com/hot"
    # Fetch and parse the page into a list of rows.
    datalist = getData(baseurl)
    # Persist the rows. ``dbpath`` must be assigned before it is used; in the
    # collapsed one-line form a stray '#' had turned this assignment into a
    # comment, which would make the saveData call fail with NameError.
    dbname = time.strftime("%Y-%m-%d", time.localtime())
    dbpath = "zhihuTop50" + dbname
    saveData(datalist, dbpath)
3、正则表达式匹配数据
# Regular expressions applied to the string form of each hot-list entry.
#
# NOTE(review): the four r'(.*?)' patterns below are reproduced exactly as they
# appear in this page. They look like they lost their surrounding HTML tag
# context when the article was extracted (a bare lazy group only ever matches
# the empty string) -- restore the real patterns from the Zhihu page markup
# before relying on them.
findid = re.compile(r'(.*?)')  # question rank -- used by getData but missing from the listing; TODO confirm pattern
findlink = re.compile(r'(.*?)')  # question link
findtitle = re.compile(r'(.*?)')  # question title
findintroduce = re.compile(r'(.*?)')  # short introduction
findscore = re.compile(r'(.*?)')  # hotness score
# NOTE(review): in the page this literal was replaced by a rendered image
# placeholder, which left an unterminated string. Reconstructed as "capture
# the src attribute of the first <img> tag" -- confirm against the original.
findimg = re.compile(r'<img[^>]*?src="(.*?)"')  # illustration image
4、程序运行结果
文章图片
文章图片
5、程序源代码
import urllib.request
import urllib.error
import sqlite3
import re
import time


def main():
    """Crawl the Zhihu hot list and store the rows in a per-day SQLite file.

    The database file name is ``"zhihuTop50"`` followed by today's local date
    (``YYYY-MM-DD``).
    """
    # Page to crawl.
    baseurl = "https://www.zhihu.com/hot"
    # Fetch and parse the page into a list of rows.
    datalist = getData(baseurl)
    # Persist the rows.
    dbname = time.strftime("%Y-%m-%d", time.localtime())
    dbpath = "zhihuTop50" + dbname
    saveData(datalist, dbpath)
    print()


# Regular expressions applied to the string form of each hot-list entry.
#
# NOTE(review): the r'(.*?)' patterns are reproduced exactly as they appear in
# this page. They look like they lost their surrounding HTML tag context when
# the article was extracted (a bare lazy group only ever matches the empty
# string) -- restore the real patterns from the Zhihu page markup before
# relying on the crawler's output.
findid = re.compile(r'(.*?)')  # question rank -- referenced by getData but missing from the listing; TODO confirm pattern
findlink = re.compile(r'(.*?)')  # question link
findtitle = re.compile(r'(.*?)')  # question title
findintroduce = re.compile(r'(.*?)')  # short introduction
findscore = re.compile(r'(.*?)')  # hotness score
# NOTE(review): this literal was replaced by a rendered image placeholder in
# the page, leaving an unterminated string. Reconstructed as "capture the src
# attribute of the first <img> tag" -- confirm against the original.
findimg = re.compile(r'<img[^>]*?src="(.*?)"')  # illustration image


def _first_or(pattern, text, default):
    """Return the first ``re.findall`` hit of *pattern* in *text*, else *default*."""
    hits = re.findall(pattern, text)
    return hits[0] if hits else default


def getData(baseurl):
    """Download *baseurl* and extract one row per hot-list entry.

    Args:
        baseurl: URL of the hot-list page.

    Returns:
        A list of 6-item lists ``[id, link, title, introduce, score, img]``.
        ``introduce`` and ``img`` fall back to ``" "`` when nothing matches.

    Raises:
        IndexError: if the link, title or score pattern finds nothing in an
            entry (these fields are treated as mandatory).
    """
    # Imported here because this is the only function that needs bs4; the
    # storage helpers stay importable without the third-party package.
    from bs4 import BeautifulSoup

    datalist = []
    html = askURL(baseurl)
    soup = BeautifulSoup(html, 'html.parser')
    for item in soup.find_all('a', class_="css-hi1lih"):
        # The listing showed ``data = https://www.it610.com/article/[]`` here,
        # an extraction artifact; the row starts as an empty list.
        data = []
        item = str(item)

        Id = _first_or(findid, item, None)
        if Id is None:
            # NOTE(review): fallback pattern kept as listed; it appears to
            # have lost its HTML context as well -- confirm.
            Id = re.findall(r'(.*?)', item)[0]
        data.append(Id)

        data.append(re.findall(findlink, item)[0])
        data.append(re.findall(findtitle, item)[0])
        data.append(_first_or(findintroduce, item, " "))
        data.append(re.findall(findscore, item)[0])
        data.append(_first_or(findimg, item, " "))

        datalist.append(data)
    return datalist


def askURL(baseurl):
    """Fetch *baseurl* and return the body decoded as UTF-8.

    On ``urllib.error.URLError`` the error's ``code`` and/or ``reason`` (when
    present) are printed and an empty string is returned.
    """
    # Request headers. Alternative desktop User-Agent kept from the listing:
    # "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36(KHTML, likeGecko) Chrome/80.0.3987.163Safari/537.36"
    head = {
        "User-Agent": "Mozilla / 5.0(iPhone; CPUiPhoneOS13_2_3likeMacOSX) AppleWebKit / 605.1.15(KHTML, likeGecko) Version / 13.0.3Mobile / 15E148Safari / 604.1"
    }
    request = urllib.request.Request(baseurl, headers=head)
    html = ""
    try:
        # The context manager closes the HTTP response even if decoding fails.
        with urllib.request.urlopen(request) as response:
            html = response.read().decode("utf-8")
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
    return html


def saveData(datalist, dbpath):
    """Write every row of *datalist* into the ``Top50`` table of *dbpath*.

    Args:
        datalist: iterable of rows; the first six items of each row are stored
            as ``id, info_link, title, introduce, score, img``.
        dbpath: path of the SQLite database file (created if missing).

    Values are bound as parameters rather than formatted into the SQL text, so
    scraped strings containing quotes can neither break nor alter the
    statement. ``insert or replace`` lets a repeated run against the same file
    refresh rows with the same id instead of failing on the primary key.
    """
    init_db(dbpath)
    sql = (
        "insert or replace into Top50(id,info_link,title,introduce,score,img) "
        "values(?,?,?,?,?,?)"
    )
    conn = sqlite3.connect(dbpath)
    try:
        # ``with conn`` commits on success and rolls back on error.
        with conn:
            conn.executemany(sql, [tuple(row[:6]) for row in datalist])
    finally:
        conn.close()


def init_db(dbpath):
    """Create the ``Top50`` table in *dbpath* if it does not exist yet."""
    # ``if not exists`` keeps a second run against the same file from failing
    # with "table Top50 already exists".
    sql = '''create table if not exists Top50(id integer primary key autoincrement,info_link text,title text,introduce text,score text,img text)'''
    conn = sqlite3.connect(dbpath)
    try:
        with conn:
            conn.execute(sql)
    finally:
        conn.close()


if __name__ == "__main__":
    main()
【如何使用python爬取知乎热榜Top50数据】到此,这篇关于如何使用 Python 爬取知乎热榜 Top50 数据的文章就介绍到这了。更多相关 Python 爬取知乎的内容,请搜索脚本之家以前的文章,或继续浏览下面的相关文章。希望大家以后多多支持脚本之家!
推荐阅读
- 考研英语阅读终极解决方案——阅读理解如何巧拿高分
- 由浅入深理解AOP
- 如何寻找情感问答App的分析切入点
- 【译】20个更有效地使用谷歌搜索的技巧
- mybatisplus如何在xml的连表查询中使用queryWrapper
- MybatisPlus LambdaQueryWrapper使用int默认值的坑及解决
- MybatisPlus使用queryWrapper如何实现复杂查询
- python学习之实现QQ自动发送消息
- 逻辑回归的理解与python示例
- python自定义封装带颜色的logging模块