如何使用python爬取知乎热榜Top50数据

目录

  • 1、导入第三方库
  • 2、程序的主函数
  • 3、正则表达式匹配数据
  • 4、程序运行结果
  • 5、程序源代码

1、导入第三方库
# Standard library
import urllib.request  # send the HTTP request for the page
import urllib.error  # URLError raised by failed requests
import sqlite3  # database storage
import re  # regular expressions
import time  # current date for the database file name

# Third-party
from bs4 import BeautifulSoup  # parse the downloaded HTML


2、程序的主函数
def main():
    """Scrape the Zhihu hot list and save it into a dated SQLite database."""
    # Page to scrape.
    baseurl = "https://www.zhihu.com/hot"
    # Download and parse the page into a list of rows.
    datalist = getData(baseurl)
    # The database file name is "zhihuTop50" followed by today's date
    # formatted as YYYY-MM-DD.
    dbname = time.strftime("%Y-%m-%d", time.localtime())
    dbpath = "zhihuTop50" + dbname
    saveData(datalist, dbpath)


3、正则表达式匹配数据
# Regular expressions applied to the HTML text of one hot-list entry.
#
# NOTE(review): the published listing lost the HTML tags inside these
# patterns (the page renderer swallowed them), and the `findid` definition
# was merged into `findlink`. `findlink` and `findimg` are rebuilt below as
# generic attribute matchers; the remaining patterns are placeholders that
# only match the empty string. Restore the surrounding tag/class markup from
# the live page before relying on them — TODO confirm against the page HTML.
findlink = re.compile(r'href="(.*?)"')  # question link (href of the entry's <a>)
findid = re.compile(r'(.*?)')  # question rank (placeholder, markup lost)
findtitle = re.compile(r'(.*?)')  # question title (placeholder, markup lost)
findintroduce = re.compile(r'(.*?)')  # short introduction (placeholder, markup lost)
findscore = re.compile(r'(.*?)')  # heat score (placeholder, markup lost)
findimg = re.compile(r'<img[^>]*?\ssrc="(.*?)"')  # article image (src of the <img>)


4、程序运行结果

(文章图片:如何使用python爬取知乎热榜Top50数据)

如何使用python爬取知乎热榜Top50数据
文章图片


5、程序源代码
import urllib.request
import urllib.error
import sqlite3
import re
import time

from bs4 import BeautifulSoup


def main():
    """Scrape the Zhihu hot list and save it into a dated SQLite database."""
    # Page to scrape.
    baseurl = "https://www.zhihu.com/hot"
    # Download and parse the page into a list of rows.
    datalist = getData(baseurl)
    # The database file name is "zhihuTop50" followed by today's date
    # formatted as YYYY-MM-DD.
    dbname = time.strftime("%Y-%m-%d", time.localtime())
    dbpath = "zhihuTop50" + dbname
    saveData(datalist, dbpath)


# Regular expressions applied to the HTML text of one hot-list entry.
#
# NOTE(review): the published listing lost the HTML tags inside these
# patterns (the page renderer swallowed them), and the `findid` definition
# was merged into `findlink`. `findlink` and `findimg` are rebuilt below as
# generic attribute matchers; the remaining patterns are placeholders that
# only match the empty string. Restore the surrounding tag/class markup from
# the live page before relying on them — TODO confirm against the page HTML.
findlink = re.compile(r'href="(.*?)"')  # question link (href of the entry's <a>)
findid = re.compile(r'(.*?)')  # question rank (placeholder, markup lost)
findid_alt = re.compile(r'(.*?)')  # alternate rank markup (placeholder, markup lost)
findtitle = re.compile(r'(.*?)')  # question title (placeholder, markup lost)
findintroduce = re.compile(r'(.*?)')  # short introduction (placeholder, markup lost)
findscore = re.compile(r'(.*?)')  # heat score (placeholder, markup lost)
findimg = re.compile(r'<img[^>]*?\ssrc="(.*?)"')  # article image (src of the <img>)


def _first_match(pattern, text, default=None):
    """Return the first match of *pattern* in *text*.

    When nothing matches, return *default*; if *default* is None the field is
    treated as required and IndexError is raised, which is what indexing the
    empty result list did in the original code.
    """
    matches = re.findall(pattern, text)
    if matches:
        return matches[0]
    if default is None:
        raise IndexError("required pattern not found in hot-list entry")
    return default


def getData(baseurl):
    """Download *baseurl* and extract one row per hot-list entry.

    Returns a list of rows; each row is a list
    [rank, link, title, introduce, score, img]. Introduce and img fall back
    to a single space when absent. A missing rank, link, title or score
    raises IndexError.
    """
    datalist = []
    html = askURL(baseurl)
    soup = BeautifulSoup(html, 'html.parser')
    for item in soup.find_all('a', class_="css-hi1lih"):
        entry = str(item)

        # Rank: try the primary pattern first, then the alternate one.
        rank = re.findall(findid, entry)
        rank = rank[0] if rank else _first_match(findid_alt, entry)

        datalist.append([
            rank,
            _first_match(findlink, entry),
            _first_match(findtitle, entry),
            _first_match(findintroduce, entry, " "),
            _first_match(findscore, entry),
            _first_match(findimg, entry, " "),
        ])
    return datalist


def askURL(baseurl):
    """Fetch *baseurl* and return the response body decoded as UTF-8.

    On urllib.error.URLError the status code and/or reason are printed and an
    empty string is returned.
    """
    # Request header carrying an iPhone User-Agent string (kept verbatim).
    head = {
        "User-Agent": "Mozilla / 5.0(iPhone; CPUiPhoneOS13_2_3likeMacOSX) AppleWebKit / 605.1.15(KHTML, likeGecko) Version / 13.0.3Mobile / 15E148Safari / 604.1"
    }
    request = urllib.request.Request(baseurl, headers=head)
    html = ""
    try:
        # The context manager closes the response even if read/decode fails.
        with urllib.request.urlopen(request) as response:
            html = response.read().decode("utf-8")
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
    return html


def saveData(datalist, dbpath):
    """Write every row of *datalist* into the Top50 table of *dbpath*.

    Each row must provide six values in the order
    (id, info_link, title, introduce, score, img).
    """
    init_db(dbpath)
    # Bound parameters instead of string formatting: scraped titles may
    # contain quotes, which would break (or inject into) a hand-built
    # statement. "or replace" lets a second run against the same database
    # file overwrite rows with the same rank instead of failing on the
    # primary key.
    sql = '''
        insert or replace into Top50(id,info_link,title,introduce,score,img)
        values(?,?,?,?,?,?)'''
    conn = sqlite3.connect(dbpath)
    try:
        cur = conn.cursor()
        for data in datalist:
            row = tuple(data[:6])
            print(row)
            cur.execute(sql, row)
        conn.commit()
        cur.close()
    finally:
        conn.close()


def init_db(dbpath):
    """Create the Top50 table in *dbpath* if it does not exist yet."""
    # "if not exists" keeps a second run against the same file from failing
    # with "table Top50 already exists".
    sql = '''
        create table if not exists Top50(
            id integer primary key autoincrement,
            info_link text,
            title text,
            introduce text,
            score text,
            img text
        )'''
    conn = sqlite3.connect(dbpath)
    try:
        conn.execute(sql)
        conn.commit()
    finally:
        conn.close()


if __name__ == "__main__":
    main()

【如何使用python爬取知乎热榜Top50数据】到此,这篇关于如何使用python爬取知乎热榜Top50数据的文章就介绍到这了。更多相关python爬取知乎内容,请搜索脚本之家以前的文章或继续浏览下面的相关文章,希望大家以后多多支持脚本之家!

    推荐阅读