爬取网易云音乐

效果图

spider_music.py主页面

# coding=gbk from download import Download from url_manager import Url_Manager from html_parser import Html_Parser from save import Save from set_text_color import Set_Colorclass Spider_Music():def __init__(self): self.download = Download() self.url_manager = Url_Manager() self.html_parser = Html_Parser() self.save = Save() self.set_color = Set_Color()def craw(self,url): self.url_manager.addurl({'url':url,'name':'temp'})while self.url_manager.checknewurllength>0: newurl = self.url_manager.geturl()if self.save.checkfile(newurl['name']): self.set_color.printDarkRed("{} 已下载!\n".format(newurl['name'])) continueprint("开始下载 {} {}".format(newurl['name'],newurl['url'])) htmlcontent = self.download.download(newurl['url'])if htmlcontent['htmlcontents'] == None: self.url_manager.delUrl(newurl) self.url_manager.addurl(newurl)newurls,result = self.html_parser.parser(htmlcontent)self.url_manager.addurls(newurls) self.save.save(result,newurl['name']) print("下载完成 {} ".format(newurl['name'])) print("共下载{}首歌曲".format(self.save.count))def main(self): self.craw('https://music.163.com/#/playlist?id=2492536378')spider = Spider_Music() spider.main()

download.py负责下载
# coding=gbk
"""Download helper for the NetEase Cloud Music crawler."""
import random
import re
from urllib.parse import urlparse

import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options


class Download():
    """Fetch either a playlist page or the audio data of a single song.

    Example:
        d = Download()
        res = d.download(
            'http://music.163.com/song/media/outer/url?id=28160459.mp3')
        # res['identify'] is 1 and res['htmlcontents'] holds the bytes.
    """

    # User-Agent strings; one is picked at random for every audio request.
    __uas = [
        "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:17.0; Baiduspider-ads) Gecko/17.0 Firefox/17.0",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9b4) Gecko/2008030317 Firefox/3.0b4",
        "Mozilla/5.0 (Windows; U; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 2.0.50727; BIDUBrowser 7.6)",
        "Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko",
        "Mozilla/5.0 (Windows NT 6.3; WOW64; rv:46.0) Gecko/20100101 Firefox/46.0",
        "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.99 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.3; Win64; x64; Trident/7.0; Touch; LCJB; rv:11.0) like Gecko",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36",
    ]

    # "ip:port" proxy addresses. Empty by default because the proxy lookup
    # in __init__ is switched off.
    __ips = []

    # Headers used when resolving the redirect of a song URL.
    headers = {
        'Referer': 'http://music.163.com/',
        'Host': 'music.163.com',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko',
        "Accept-Encoding": "gzip, deflate",
        "Accept-Language": "zh-CN,zh; q=0.9",
    }

    # Location of the chromedriver executable used by geturl().
    chromedriver_path = "D:\\tools\\chromedriver_win32\\chromedriver.exe"

    # Seconds to wait on any HTTP request before giving up.
    timeout = 30

    def __init__(self):
        self.url = ''
        # Proxy lookup is disabled: no reliable free proxy source was found.
        # self.__ips = self.get_ip()

    def download(self, url):
        """Fetch ``url`` and return the result dict built by ``patterns``."""
        self.url = url
        return self.patterns

    @property
    def patterns(self):
        """Classify ``self.url`` and fetch the matching content.

        Returns:
            A dict with two keys. ``identify`` is 1 for a song download
            URL, 2 for a playlist page and False when the URL matches
            neither. ``htmlcontents`` holds the audio bytes, the page
            source, None when fetching failed, or '' when nothing matched.
        """
        playlist = re.compile(r"playlist\?id=\d+")  # playlist page
        song = re.compile(r"song/media/outer/url\?id=\d+")  # song download

        res = {
            'identify': False,
            'htmlcontents': '',
        }

        if re.search(song, self.url):
            res['identify'] = 1
            res['htmlcontents'] = self.getmusic()
        elif re.search(playlist, self.url):
            res['identify'] = 2
            res['htmlcontents'] = self.geturl()

        return res

    def get_ip(self):
        """Scrape free proxies and return them as "ip:port" strings.

        Currently unused, see ``__init__``. Address and port are matched
        together row by row so that a port is never confused with one of
        the octets of an address.

        NOTE(review): assumes each table row lists the address before the
        port - confirm against the live page before re-enabling.
        """
        url = "https://www.kuaidaili.com/free/inha/1/"
        res = requests.get(url, timeout=self.timeout)
        soup = BeautifulSoup(res.text, 'html.parser')

        container = soup.find(id="list")
        body = container.find('tbody') if container is not None else None
        if body is None:
            return []

        pair = re.compile(r'(\d+\.\d+\.\d+\.\d+)\D+(\d+)')
        proxies = []
        for row in body.find_all('tr'):
            found = pair.search(row.get_text(' '))
            if found:
                # e.g. 115.112.88.23:8080
                proxies.append(":".join(found.groups()))
        return proxies

    def getmusic(self):
        """Download the audio behind ``self.url``.

        Returns:
            The response body as bytes, or None when the request failed.
        """
        try:
            url = self.getrealurl()
            headers = {
                "Accept": "text/html,application/xhtml+xml,application/xml; q=0.9,image/webp,image/apng,*/*; q=0.8",
                "Accept-Encoding": "gzip, deflate",
                "Accept-Language": "zh-CN,zh; q=0.9",
                # str.strip('http://') removes a set of characters, not a
                # prefix, and mangles hosts that start with h, t or p.
                "Host": urlparse(url).netloc,
                # Vary the browser identity between requests.
                "User-Agent": random.choice(self.__uas),
            }

            # Only route through a proxy when one is actually configured;
            # choosing from the empty list would raise on every call.
            proxies = None
            if self.__ips:
                ip = random.choice(self.__ips)
                proxies = {
                    'http': 'http://' + ip,
                    'https': 'http://' + ip,
                }

            res = requests.get(url, headers=headers, proxies=proxies,
                               timeout=self.timeout)
            # Do not hand an HTTP error page to the caller as audio data.
            res.raise_for_status()
        except requests.RequestException as e:
            print(e)
            return None
        else:
            return res.content

    def getrealurl(self):
        """Follow redirects from ``self.url`` and return the final URL."""
        # stream=True avoids downloading the body just to read the URL.
        res = requests.get(self.url, headers=self.headers, stream=True,
                           timeout=self.timeout)
        try:
            return res.url
        finally:
            res.close()

    def geturl(self):
        """Render the playlist page in headless Chrome.

        Returns:
            The page source after switching into the "contentFrame"
            frame, or None when the browser could not load the page.

        NOTE(review): the positional driver path and
        find_element_by_name are Selenium 3 style calls - confirm the
        installed Selenium version before upgrading.
        """
        brower = None
        try:
            chrome_options = Options()
            chrome_options.add_argument('--headless')
            chrome_options.add_argument('--disable-gpu')
            brower = webdriver.Chrome(self.chromedriver_path,
                                      options=chrome_options)
            brower.get(self.url)
            # The song list lives inside this frame.
            brower.switch_to.frame(
                brower.find_element_by_name("contentFrame"))
            return brower.page_source
        except Exception as e:
            # Boundary with the browser: report the problem and let the
            # caller treat the page as not downloaded.
            print(e)
            return None
        finally:
            # Always shut the browser down, otherwise every call leaves a
            # Chrome process behind.
            if brower is not None:
                brower.quit()

html_parser.py负责网页内容解析
from bs4 import BeautifulSoup


class Html_Parser():
    """Turn a download result into new URLs to visit and data to save."""

    # Template of a song download address; the placeholder receives the
    # query string of the song link, e.g. "id=123".
    baseurl = "http://music.163.com/song/media/outer/url?{}.mp3"

    def parser(self, res):
        """Dispatch on the kind of content that was downloaded.

        Args:
            res: Dict with the keys ``identify`` and ``htmlcontents``.

        Returns:
            A ``(new_urls, data)`` tuple. When ``identify`` is 1 there are
            no new URLs and ``data`` is the downloaded content. Otherwise
            ``new_urls`` lists the songs found in the page and ``data`` is
            False.
        """
        if res.get('identify') == 1:
            return None, res.get('htmlcontents', False)
        return self.geturls(res['htmlcontents'])

    def geturls(self, htmlcontent):
        """Extract the download URL and title of every song in a page.

        Args:
            htmlcontent: HTML source of a playlist page; may be empty.

        Returns:
            A ``(songs, False)`` tuple where ``songs`` is a list of
            ``{'url': ..., 'name': ...}`` dicts. The tuple is returned even
            when nothing could be parsed, because callers unpack it.
        """
        newsurl = []
        if not htmlcontent:
            return newsurl, False

        soup = BeautifulSoup(htmlcontent, 'html.parser')
        table = soup.find('table')
        if table is None:
            print("no song table found in page")
            return newsurl, False

        # The first row is skipped; the songs start at the second one.
        for link in table.find_all('tr')[1:]:
            try:
                anchor = link.find_all('td')[1].find('a')
                url = self.baseurl.format(anchor['href'].split('?')[-1])
                name = anchor.find('b')['title']
            except (AttributeError, IndexError, KeyError, TypeError) as e:
                # One malformed row must not discard the whole playlist.
                print(e)
                continue
            newsurl.append({'url': url, 'name': name})

        return newsurl, False

url_manager.py:URL 管理器
# coding=gbk
class Url_Manager():
    """Track which URL entries still have to be fetched and which are done.

    Entries are compared with ``==`` and ``in``, so any comparable object
    works; the crawler uses ``{'url': ..., 'name': ...}`` dicts.
    """

    def __init__(self):
        # Per-instance queues. Class-level lists would be shared by every
        # manager that is ever created.
        self.__newurl = []  # entries not downloaded yet
        self.__oldurl = []  # entries already handed out

    def addurl(self, url):
        """Queue ``url`` unless it is None or already known."""
        if url is None:
            return
        if self.checkurl(url):
            self.__newurl.append(url)

    def addurls(self, urls):
        """Queue every entry of ``urls``; None is accepted and ignored."""
        if urls is None:
            return
        for url in urls:
            self.addurl(url)

    def geturl(self):
        """Hand out the most recently queued entry and mark it as done.

        Raises:
            IndexError: If no entry is pending; check
                ``checknewurllength`` first.
        """
        newurl = self.__newurl.pop()
        self.__oldurl.append(newurl)
        return newurl

    def delUrl(self, url):
        """Forget that ``url`` was handed out so it can be queued again."""
        if url in self.__oldurl:
            self.__oldurl.remove(url)

    @property
    def checknewurllength(self):
        """Number of entries still waiting to be fetched."""
        return len(self.__newurl)

    def checkurl(self, url):
        """Return True when ``url`` is neither pending nor already done."""
        return url not in self.__newurl and url not in self.__oldurl

save.py保存下载内容
# coding=gbk
import os


class Save():
    """Write downloaded songs to disk as ``<path>/<name>.mp3`` files."""

    # Directory that receives the files.
    path = "./download/"
    # Number of files written successfully.
    count = 0
    # Characters Windows does not allow in file names: \ / : * ? " < > |
    _FORBIDDEN = str.maketrans('', '', '\\/:*?"<>|')

    def __init__(self):
        self.mkdir(self.path)

    def save(self, contents, name):
        """Write ``contents`` to the file derived from ``name``.

        Nothing is written when either argument is empty. Write errors are
        reported on stdout and do not stop the crawl; ``count`` only grows
        on success.

        Args:
            contents: Bytes to store.
            name: Song title; it is sanitised before use as a file name.
        """
        if not (contents and name):
            return
        try:
            with open(self.remove_special_characters(name), 'wb') as f:
                f.write(contents)
        except (OSError, TypeError) as e:
            print(e)
        else:
            self.count += 1

    def mkdir(self, path):
        """Create the target directory if it does not exist yet."""
        # exist_ok avoids the race between checking and creating.
        os.makedirs(path, exist_ok=True)

    def checkfile(self, name):
        """Return True when the song called ``name`` is already on disk.

        The placeholder name 'temp' belongs to the playlist page, which is
        never stored, so it always counts as not downloaded.
        """
        if name == 'temp':
            return False
        return os.path.exists(self.remove_special_characters(name))

    def remove_special_characters(self, string):
        """Build the target file path for the song title ``string``.

        Characters that Windows forbids in file names are dropped and
        surrounding whitespace is trimmed.
        """
        cleaned = string.translate(self._FORBIDDEN).strip()
        # Only the trailing slash is trimmed so that an absolute directory
        # keeps its leading one.
        return self.path.rstrip('/') + '/' + cleaned + ".mp3"

set_text_color.py:设置 cmd 窗口显示颜色
# coding=gbk
# Reference: https://blog.csdn.net/wy_97/article/details/79663014
import ctypes
import sys


class Set_Color():
    """Print coloured text in a Windows cmd window.

    Colours are set through the kernel32 console API. Where that API is
    not available the text is still printed, just without colour.

    Example:
        c = Set_Color()
        c.printDarkRed('printDarkRed: dark red text\n')
    """

    STD_INPUT_HANDLE = -10
    STD_OUTPUT_HANDLE = -11
    STD_ERROR_HANDLE = -12

    # A colour attribute is two hex digits: the high digit is the
    # background, the low digit the text colour. Each digit ranges over
    # 0-f, so there are 16 colours; values can be combined with "|".

    # Text colours.
    FOREGROUND_BLACK = 0x00  # black.
    FOREGROUND_DARKBLUE = 0x01  # dark blue.
    FOREGROUND_DARKGREEN = 0x02  # dark green.
    FOREGROUND_DARKSKYBLUE = 0x03  # dark skyblue.
    FOREGROUND_DARKRED = 0x04  # dark red.
    FOREGROUND_DARKPINK = 0x05  # dark pink.
    FOREGROUND_DARKYELLOW = 0x06  # dark yellow.
    FOREGROUND_DARKWHITE = 0x07  # dark white.
    FOREGROUND_DARKGRAY = 0x08  # dark gray.
    FOREGROUND_BLUE = 0x09  # blue.
    FOREGROUND_GREEN = 0x0a  # green.
    FOREGROUND_SKYBLUE = 0x0b  # skyblue.
    FOREGROUND_RED = 0x0c  # red.
    FOREGROUND_PINK = 0x0d  # pink.
    FOREGROUND_YELLOW = 0x0e  # yellow.
    FOREGROUND_WHITE = 0x0f  # white.

    # Background colours. The two dark entries used to be declared as
    # BACKGROUND_BLUE / BACKGROUND_GREEN and were silently overwritten by
    # the bright definitions further down.
    BACKGROUND_DARKBLUE = 0x10  # dark blue.
    BACKGROUND_DARKGREEN = 0x20  # dark green.
    BACKGROUND_DARKSKYBLUE = 0x30  # dark skyblue.
    BACKGROUND_DARKRED = 0x40  # dark red.
    BACKGROUND_DARKPINK = 0x50  # dark pink.
    BACKGROUND_DARKYELLOW = 0x60  # dark yellow.
    BACKGROUND_DARKWHITE = 0x70  # dark white.
    BACKGROUND_DARKGRAY = 0x80  # dark gray.
    BACKGROUND_BLUE = 0x90  # blue.
    BACKGROUND_GREEN = 0xa0  # green.
    BACKGROUND_SKYBLUE = 0xb0  # skyblue.
    BACKGROUND_RED = 0xc0  # red.
    BACKGROUND_PINK = 0xd0  # pink.
    BACKGROUND_YELLOW = 0xe0  # yellow.
    BACKGROUND_WHITE = 0xf0  # white.

    # ctypes.windll only exists on Windows; stay importable elsewhere.
    _windll = getattr(ctypes, 'windll', None)
    _kernel32 = _windll.kernel32 if _windll is not None else None
    # Handle of the console's standard output.
    std_out_handle = (_kernel32.GetStdHandle(STD_OUTPUT_HANDLE)
                      if _kernel32 is not None else None)

    def set_cmd_text_color(self, color, handle=False):
        """Apply ``color`` to the console.

        Args:
            color: Attribute value, e.g. ``FOREGROUND_RED``.
            handle: Console handle to use; the standard output handle is
                used when omitted.

        Returns:
            The result of SetConsoleTextAttribute, or False when the
            console API is not available.
        """
        if self._kernel32 is None:
            return False
        target = handle if handle else self.std_out_handle
        return self._kernel32.SetConsoleTextAttribute(target, color)

    def resetColor(self):
        """Switch back to the colour used after every print: green."""
        self.set_cmd_text_color(self.FOREGROUND_GREEN)

    def resetDefault(self):
        """Switch to white text (red | green | blue)."""
        self.set_cmd_text_color(
            self.FOREGROUND_RED | self.FOREGROUND_GREEN | self.FOREGROUND_BLUE)

    def _print_colored(self, color, mess):
        """Write ``mess`` in ``color`` and restore the usual colour."""
        self.set_cmd_text_color(color)
        sys.stdout.write(mess)
        # Flush while the colour is active, otherwise buffered text may
        # only reach the console after the colour was reset.
        sys.stdout.flush()
        self.resetColor()

    def printDarkBlue(self, mess):
        """Print ``mess`` in dark blue."""
        self._print_colored(self.FOREGROUND_DARKBLUE, mess)

    def printDarkGreen(self, mess):
        """Print ``mess`` in dark green."""
        self._print_colored(self.FOREGROUND_DARKGREEN, mess)

    def printDarkSkyBlue(self, mess):
        """Print ``mess`` in dark sky blue."""
        self._print_colored(self.FOREGROUND_DARKSKYBLUE, mess)

    def printDarkRed(self, mess):
        """Print ``mess`` in dark red."""
        self._print_colored(self.FOREGROUND_DARKRED, mess)

    def printDarkPink(self, mess):
        """Print ``mess`` in dark pink."""
        self._print_colored(self.FOREGROUND_DARKPINK, mess)

    def printDarkYellow(self, mess):
        """Print ``mess`` in dark yellow."""
        self._print_colored(self.FOREGROUND_DARKYELLOW, mess)

    def printDarkWhite(self, mess):
        """Print ``mess`` in dark white."""
        self._print_colored(self.FOREGROUND_DARKWHITE, mess)

    def printDarkGray(self, mess):
        """Print ``mess`` in dark gray."""
        self._print_colored(self.FOREGROUND_DARKGRAY, mess)

    def printBlue(self, mess):
        """Print ``mess`` in blue."""
        self._print_colored(self.FOREGROUND_BLUE, mess)

    def printGreen(self, mess):
        """Print ``mess`` in green."""
        self._print_colored(self.FOREGROUND_GREEN, mess)

    def printSkyBlue(self, mess):
        """Print ``mess`` in sky blue."""
        self._print_colored(self.FOREGROUND_SKYBLUE, mess)

    def printRed(self, mess):
        """Print ``mess`` in red."""
        self._print_colored(self.FOREGROUND_RED, mess)

    def printPink(self, mess):
        """Print ``mess`` in pink."""
        self._print_colored(self.FOREGROUND_PINK, mess)

    def printYellow(self, mess):
        """Print ``mess`` in yellow."""
        self._print_colored(self.FOREGROUND_YELLOW, mess)

    def printWhite(self, mess):
        """Print ``mess`` in white."""
        self._print_colored(self.FOREGROUND_WHITE, mess)

    def printWhiteBlack(self, mess):
        """Print ``mess`` as black text on a white background."""
        self._print_colored(
            self.FOREGROUND_BLACK | self.BACKGROUND_WHITE, mess)

    def printWhiteBlack_2(self, mess):
        """Print ``mess`` as black text on white, using the raw value."""
        self._print_colored(0xf0, mess)

    def printYellowRed(self, mess):
        """Print ``mess`` as red text on a yellow background."""
        self._print_colored(
            self.BACKGROUND_YELLOW | self.FOREGROUND_RED, mess)

    推荐阅读